Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Implement Intl.Segmenter #2840

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions boa_engine/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ intl = [
"dep:icu_collator",
"dep:icu_casemapping",
"dep:icu_list",
"dep:icu_segmenter",
"dep:writeable",
"dep:sys-locale",
"dep:yoke",
Expand Down Expand Up @@ -75,6 +76,7 @@ dashmap = "5.4.0"
num_enum = "0.6.1"
pollster = "0.3.0"
thin-vec = "0.2.12"
itertools = { version = "0.10.5", default-features = false }

# intl deps
boa_icu_provider = { workspace = true, optional = true }
Expand All @@ -87,6 +89,7 @@ icu_plurals = { version = "1.2.0", features = ["serde"], optional = true }
icu_provider = { version = "1.2.0", optional = true }
icu_list = { version = "1.2.0", features = ["serde"], optional = true }
icu_casemapping = { version = "0.7.2", features = ["serde"], optional = true}
icu_segmenter = { version = "1.2.1", features = ["serde"], optional = true }
writeable = { version = "0.5.2", optional = true }
yoke = { version = "0.7.1", optional = true }
zerofrom = { version = "0.1.2", optional = true }
Expand Down
35 changes: 28 additions & 7 deletions boa_engine/src/builtins/intl/locale/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use icu_locid::{
};
use icu_locid_transform::LocaleCanonicalizer;
use icu_provider::{DataLocale, DataProvider, DataRequest, DataRequestMetadata, KeyedDataMarker};
use icu_segmenter::provider::WordBreakDataV1Marker;
use indexmap::IndexSet;

use tap::TapOptional;
Expand Down Expand Up @@ -115,9 +116,14 @@ pub(crate) fn canonicalize_locale_list(
// iv. Else,
else {
// 1. Let tag be ? ToString(kValue).
let k_value = k_value.to_string(context)?.to_std_string_escaped();
if k_value.contains('_') {
return Err(JsNativeError::range()
.with_message("locale is not a structurally valid language tag")
.into());
}

k_value
.to_string(context)?
.to_std_string_escaped()
.parse()
// v. If IsStructurallyValidLanguageTag(tag) is false, throw a RangeError exception.
.map_err(|_| {
Expand Down Expand Up @@ -169,7 +175,11 @@ pub(crate) fn best_available_locale<M: KeyedDataMarker>(
provider,
DataRequest {
locale: &candidate,
metadata: DataRequestMetadata::default(),
metadata: {
let mut metadata = DataRequestMetadata::default();
metadata.silent = true;
metadata
},
},
);

Expand All @@ -180,10 +190,15 @@ pub(crate) fn best_available_locale<M: KeyedDataMarker>(
// the fallback algorithm, even if the used locale is exactly the same as the required
// locale.
match req.metadata.locale {
// TODO: ugly hack to accept locales that fallback to "und" in the collator/segmenter services
Some(loc)
if loc == candidate
// TODO: ugly hack to accept locales that fallback to "und" in the collator service
|| (loc.is_empty() && M::KEY.path() == CollationMetadataV1Marker::KEY.path()) =>
|| (loc.is_empty()
&& [
CollationMetadataV1Marker::KEY.path(),
WordBreakDataV1Marker::KEY.path(),
]
.contains(&M::KEY.path())) =>
{
return Some(candidate.into_locale().id)
}
Expand Down Expand Up @@ -242,8 +257,14 @@ pub(crate) fn best_locale_for_provider<M: KeyedDataMarker>(
.metadata
.locale
.map(|dl| {
// TODO: ugly hack to accept locales that fallback to "und" in the collator service
if M::KEY.path() == CollationMetadataV1Marker::KEY.path() && dl.is_empty() {
// TODO: ugly hack to accept locales that fallback to "und" in the collator/segmenter services
if [
CollationMetadataV1Marker::KEY.path(),
WordBreakDataV1Marker::KEY.path(),
]
.contains(&M::KEY.path())
&& dl.is_empty()
{
candidate.clone()
} else {
dl.into_locale().id
Expand Down
152 changes: 152 additions & 0 deletions boa_engine/src/builtins/intl/segmenter/iterator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
use boa_gc::{Finalize, Trace};
use boa_profiler::Profiler;
use icu_segmenter::{
GraphemeClusterBreakIteratorUtf16, SentenceBreakIteratorUtf16, WordBreakIteratorUtf16,
};

use crate::{
builtins::{iterable::create_iter_result_object, BuiltInBuilder, IntrinsicObject},
context::intrinsics::Intrinsics,
js_string,
object::ObjectData,
property::Attribute,
realm::Realm,
Context, JsNativeError, JsObject, JsResult, JsString, JsSymbol, JsValue,
};

use super::create_segment_data_object;

/// A native break iterator over UTF-16 text, with one variant for each kind of
/// `icu_segmenter` break iterator this module can drive.
///
/// NOTE(review): the `'l` and `'s` lifetimes are forwarded unchanged to the wrapped
/// iterators; presumably they tie the iterator to the segmenter data and to the
/// segmented text respectively — confirm against the `icu_segmenter` documentation.
pub(crate) enum NativeSegmentIterator<'l, 's> {
/// Wraps a `GraphemeClusterBreakIteratorUtf16`.
Grapheme(GraphemeClusterBreakIteratorUtf16<'l, 's>),
/// Wraps a `WordBreakIteratorUtf16`; the only variant that can report word-likeness.
Word(WordBreakIteratorUtf16<'l, 's>),
/// Wraps a `SentenceBreakIteratorUtf16`.
Sentence(SentenceBreakIteratorUtf16<'l, 's>),
}

impl Iterator for NativeSegmentIterator<'_, '_> {
type Item = usize;

fn next(&mut self) -> Option<Self::Item> {
match self {
NativeSegmentIterator::Grapheme(g) => g.next(),
NativeSegmentIterator::Word(w) => w.next(),
NativeSegmentIterator::Sentence(s) => s.next(),
}
}
}

impl NativeSegmentIterator<'_, '_> {
    /// Reports whether the segment preceding the current boundary is word-like.
    ///
    /// Only word break iterators carry this information: for the `Word` variant the
    /// result is `Some(..)` with the value reported by the wrapped iterator, while the
    /// `Grapheme` and `Sentence` variants always return `None`.
    pub(crate) fn is_word_like(&self) -> Option<bool> {
        match self {
            Self::Word(words) => Some(words.is_word_like()),
            Self::Grapheme(_) | Self::Sentence(_) => None,
        }
    }
}

/// State stored in a `%SegmentIteratorPrototype%` object, holding the internal slots
/// set up by [`CreateSegmentIterator`][spec].
///
/// [spec]: https://tc39.es/ecma402/#sec-createsegmentiterator
#[derive(Debug, Trace, Finalize)]
pub struct SegmentIterator {
// `[[IteratingSegmenter]]`: the segmenter object whose native segmenter drives iteration.
segmenter: JsObject,
// `[[IteratedString]]`: the string being segmented.
string: JsString,
// `[[IteratedStringNextSegmentCodeUnitIndex]]`: code unit index where the next
// segment starts; initialised to 0 and advanced by every successful `next` call.
next_segment_index: usize,
}

impl IntrinsicObject for SegmentIterator {
    /// Builds the `%SegmentIteratorPrototype%` intrinsic for `realm`: a configurable
    /// `@@toStringTag` property plus the `next` method.
    fn init(realm: &Realm) {
        let _timer = Profiler::global().start_event("%SegmentIteratorPrototype%", "init");

        // Key/value pair for the `@@toStringTag` property of the prototype.
        let tag_key = JsSymbol::to_string_tag();
        let tag_value = js_string!("Segmenter String Iterator");

        BuiltInBuilder::with_intrinsic::<Self>(realm)
            .static_property(tag_key, tag_value, Attribute::CONFIGURABLE)
            .static_method(Self::next, js_string!("next"), 0)
            .build();
    }

    /// Fetches the segment iterator prototype object from the realm intrinsics.
    fn get(intrinsics: &Intrinsics) -> JsObject {
        intrinsics.objects().iterator_prototypes().segment()
    }
}

impl SegmentIterator {
/// [`CreateSegmentIterator ( segmenter, string )`][spec]
///
/// [spec]: https://tc39.es/ecma402/#sec-createsegmentiterator
pub(crate) fn create(
segmenter: JsObject,
string: JsString,
context: &mut Context<'_>,
) -> JsObject {
// 1. Let internalSlotsList be « [[IteratingSegmenter]], [[IteratedString]], [[IteratedStringNextSegmentCodeUnitIndex]] ».
// 2. Let iterator be OrdinaryObjectCreate(%SegmentIteratorPrototype%, internalSlotsList).
// 3. Set iterator.[[IteratingSegmenter]] to segmenter.
// 4. Set iterator.[[IteratedString]] to string.
// 5. Set iterator.[[IteratedStringNextSegmentCodeUnitIndex]] to 0.
// 6. Return iterator.
JsObject::from_proto_and_data(
context
.intrinsics()
.objects()
.iterator_prototypes()
.segment(),
ObjectData::segment_iterator(Self {
segmenter,
string,
next_segment_index: 0,
}),
)
}
/// [`%SegmentIteratorPrototype%.next ( )`][spec]
///
/// [spec]: https://tc39.es/ecma402/#sec-%segmentiteratorprototype%.next
fn next(this: &JsValue, _: &[JsValue], context: &mut Context<'_>) -> JsResult<JsValue> {
// 1. Let iterator be the this value.
// 2. Perform ? RequireInternalSlot(iterator, [[IteratingSegmenter]]).
let mut iter = this.as_object().map(JsObject::borrow_mut).ok_or_else(|| {
JsNativeError::typ()
.with_message("`next` can only be called on a `Segment Iterator` object")
})?;
let iter = iter.as_segment_iterator_mut().ok_or_else(|| {
JsNativeError::typ()
.with_message("`next` can only be called on a `Segment Iterator` object")
})?;

// 5. Let startIndex be iterator.[[IteratedStringNextSegmentCodeUnitIndex]].
let start = iter.next_segment_index;

// 4. Let string be iterator.[[IteratedString]].
// 6. Let endIndex be ! FindBoundary(segmenter, string, startIndex, after).
let Some((end, is_word_like)) = iter.string.get(start..).and_then(|string| {
// 3. Let segmenter be iterator.[[IteratingSegmenter]].
let segmenter = iter.segmenter.borrow();
let segmenter = segmenter
.as_segmenter()
.expect("segment iterator object should contain a segmenter");
let mut segments = segmenter.native.segment(string);
// the first elem is always 0.
segments.next();
segments.next().map(|end| (start + end, segments.is_word_like()))
}) else {
// 7. If endIndex is not finite, then
// a. Return CreateIterResultObject(undefined, true).
return Ok(create_iter_result_object(JsValue::undefined(), true, context));
};
// 8. Set iterator.[[IteratedStringNextSegmentCodeUnitIndex]] to endIndex.
iter.next_segment_index = end;

// 9. Let segmentData be ! CreateSegmentDataObject(segmenter, string, startIndex, endIndex).
let segment_data =
create_segment_data_object(iter.string.clone(), start..end, is_word_like, context);

// 10. Return CreateIterResultObject(segmentData, false).
Ok(create_iter_result_object(
segment_data.into(),
false,
context,
))
}
}
Loading