diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 49318e1c..1370e03e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,6 +29,7 @@ jobs: - "--features no_object" - "--features no_function" - "--features no_module" + - "--features unicode-xid-ident" toolchain: [stable] experimental: [false] include: diff --git a/Cargo.toml b/Cargo.toml index 2f2ab525..f049777e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ include = [ "Cargo.toml" ] keywords = [ "scripting" ] -categories = [ "no-std", "embedded", "parser-implementations" ] +categories = [ "no-std", "embedded", "wasm", "parser-implementations" ] [dependencies] num-traits = { version = "0.2.11", default-features = false } @@ -34,6 +34,7 @@ no_object = [] # no custom objects no_function = [] # no script-defined functions no_module = [] # no modules internals = [] # expose internal data structures +unicode-xid-ident = ["unicode-xid"] # allow Unicode Standard Annex #31 for identifiers. # compiling for no-std no_std = [ "num-traits/libm", "hashbrown", "core-error", "libm", "ahash" ] @@ -73,6 +74,11 @@ default_features = false features = ["derive", "alloc"] optional = true +[dependencies.unicode-xid] +version = "0.2.1" +default_features = false +optional = true + [target.'cfg(target_arch = "wasm32")'.dependencies] instant= { version = "0.1.4", features = ["wasm-bindgen"] } # WASM implementation of std::time::Instant diff --git a/RELEASES.md b/RELEASES.md index bd4623c0..7f59cddf 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -20,6 +20,7 @@ New features * Custom syntax now works even without the `internals` feature. * Currying of function pointers is supported via the new `curry` keyword. * `Module::set_indexer_get_set_fn` is added as a shorthand of both `Module::set_indexer_get_fn` and `Module::set_indexer_set_fn`. +* New `unicode-xid-ident` feature to allow [Unicode Standard Annex #31](http://www.unicode.org/reports/tr31/) for identifiers. 
Breaking changes ---------------- diff --git a/doc/src/language/variables.md b/doc/src/language/variables.md index be4abbce..455c075b 100644 --- a/doc/src/language/variables.md +++ b/doc/src/language/variables.md @@ -21,6 +21,11 @@ Variable names are case _sensitive_. Variable names also cannot be the same as a [keyword]. +### Unicode Standard Annex #31 Identifiers + +The [`unicode-xid-ident`] feature expands the allowed characters for variable names to the set defined by +[Unicode Standard Annex #31](http://www.unicode.org/reports/tr31/). + Declare a Variable ------------------ diff --git a/doc/src/links.md b/doc/src/links.md index 889cee29..936c714d 100644 --- a/doc/src/links.md +++ b/doc/src/links.md @@ -12,6 +12,7 @@ [`no_std`]: {{rootUrl}}/start/features.md [`no-std`]: {{rootUrl}}/start/features.md [`internals`]: {{rootUrl}}/start/features.md +[`unicode-xid-ident`]: {{rootUrl}}/start/features.md [minimal builds]: {{rootUrl}}/start/builds/minimal.md [WASM]: {{rootUrl}}/start/builds/wasm.md diff --git a/doc/src/start/features.md b/doc/src/start/features.md index 0d31d077..0f6b0f53 100644 --- a/doc/src/start/features.md +++ b/doc/src/start/features.md @@ -11,21 +11,22 @@ Notice that this deviates from Rust norm where features are _additive_. Excluding unneeded functionalities can result in smaller, faster builds as well as more control over what a script can (or cannot) do. -| Feature | Description | -| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `unchecked` | Disable arithmetic checking (such as over-flows and division by zero), call stack depth limit, operations count limit and modules loading limit.
Beware that a bad script may panic the entire system! | -| `sync` | Restrict all values types to those that are `Send + Sync`. Under this feature, all Rhai types, including [`Engine`], [`Scope`] and [`AST`], are all `Send + Sync`. | -| `no_optimize` | Disable [script optimization]. | -| `no_float` | Disable floating-point numbers and math. | -| `only_i32` | Set the system integer type to `i32` and disable all other integer types. `INT` is set to `i32`. | -| `only_i64` | Set the system integer type to `i64` and disable all other integer types. `INT` is set to `i64`. | -| `no_index` | Disable [arrays] and indexing features. | -| `no_object` | Disable support for [custom types] and [object maps]. | -| `no_function` | Disable script-defined [functions]. | -| `no_module` | Disable loading external [modules]. | -| `no_std` | Build for `no-std`. Notice that additional dependencies will be pulled in to replace `std` features. | -| `serde` | Enable serialization/deserialization via `serde`. Notice that the [`serde`](https://crates.io/crates/serde) crate will be pulled in together with its dependencies. | -| `internals` | Expose internal data structures (e.g. [`AST`] nodes). Beware that Rhai internals are volatile and may change from version to version. | +| Feature | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `unchecked` | Disable arithmetic checking (such as over-flows and division by zero), call stack depth limit, operations count limit and modules loading limit.
Beware that a bad script may panic the entire system! | +| `sync` | Restrict all values types to those that are `Send + Sync`. Under this feature, all Rhai types, including [`Engine`], [`Scope`] and [`AST`], are all `Send + Sync`. | +| `no_optimize` | Disable [script optimization]. | +| `no_float` | Disable floating-point numbers and math. | +| `only_i32` | Set the system integer type to `i32` and disable all other integer types. `INT` is set to `i32`. | +| `only_i64` | Set the system integer type to `i64` and disable all other integer types. `INT` is set to `i64`. | +| `no_index` | Disable [arrays] and indexing features. | +| `no_object` | Disable support for [custom types] and [object maps]. | +| `no_function` | Disable script-defined [functions]. | +| `no_module` | Disable loading external [modules]. | +| `no_std` | Build for `no-std`. Notice that additional dependencies will be pulled in to replace `std` features. | +| `serde` | Enable serialization/deserialization via `serde`. Notice that the [`serde`](https://crates.io/crates/serde) crate will be pulled in together with its dependencies. | +| `internals` | Expose internal data structures (e.g. [`AST`] nodes). Beware that Rhai internals are volatile and may change from version to version. | +| `unicode-xid-ident` | Allow identifiers that follow [Unicode Standard Annex #31](http://www.unicode.org/reports/tr31/). 
| Example diff --git a/src/fn_call.rs b/src/fn_call.rs index ccdd14d5..96e81580 100644 --- a/src/fn_call.rs +++ b/src/fn_call.rs @@ -931,8 +931,8 @@ pub fn run_builtin_binary_op( } if args_type == TypeId::of::() { - let x = *x.downcast_ref::().unwrap(); - let y = *y.downcast_ref::().unwrap(); + let x = x.clone().cast::(); + let y = y.clone().cast::(); #[cfg(not(feature = "unchecked"))] match op { @@ -973,8 +973,8 @@ pub fn run_builtin_binary_op( _ => (), } } else if args_type == TypeId::of::() { - let x = *x.downcast_ref::().unwrap(); - let y = *y.downcast_ref::().unwrap(); + let x = x.clone().cast::(); + let y = y.clone().cast::(); match op { "&" => return Ok(Some((x && y).into())), @@ -999,8 +999,8 @@ pub fn run_builtin_binary_op( _ => (), } } else if args_type == TypeId::of::() { - let x = *x.downcast_ref::().unwrap(); - let y = *y.downcast_ref::().unwrap(); + let x = x.clone().cast::(); + let y = y.clone().cast::(); match op { "==" => return Ok(Some((x == y).into())), @@ -1021,8 +1021,8 @@ pub fn run_builtin_binary_op( #[cfg(not(feature = "no_float"))] if args_type == TypeId::of::() { - let x = *x.downcast_ref::().unwrap(); - let y = *y.downcast_ref::().unwrap(); + let x = x.clone().cast::(); + let y = y.clone().cast::(); match op { "+" => return Ok(Some((x + y).into())), @@ -1060,7 +1060,7 @@ pub fn run_builtin_op_assignment( if args_type == TypeId::of::() { let x = x.downcast_mut::().unwrap(); - let y = *y.downcast_ref::().unwrap(); + let y = y.clone().cast::(); #[cfg(not(feature = "unchecked"))] match op { @@ -1096,7 +1096,7 @@ pub fn run_builtin_op_assignment( } } else if args_type == TypeId::of::() { let x = x.downcast_mut::().unwrap(); - let y = *y.downcast_ref::().unwrap(); + let y = y.clone().cast::(); match op { "&=" => return Ok(Some(*x = *x && y)), @@ -1116,7 +1116,7 @@ pub fn run_builtin_op_assignment( #[cfg(not(feature = "no_float"))] if args_type == TypeId::of::() { let x = x.downcast_mut::().unwrap(); - let y = *y.downcast_ref::().unwrap(); + let 
y = y.clone().cast::(); match op { "+=" => return Ok(Some(*x += y)), diff --git a/src/fn_native.rs b/src/fn_native.rs index 6cd90321..116c62ab 100644 --- a/src/fn_native.rs +++ b/src/fn_native.rs @@ -300,11 +300,9 @@ impl CallableFunction { /// Get the access mode. pub fn access(&self) -> FnAccess { match self { - CallableFunction::Plugin(_) => FnAccess::Public, - CallableFunction::Pure(_) - | CallableFunction::Method(_) - | CallableFunction::Iterator(_) => FnAccess::Public, - CallableFunction::Script(f) => f.access, + Self::Plugin(_) => FnAccess::Public, + Self::Pure(_) | Self::Method(_) | Self::Iterator(_) => FnAccess::Public, + Self::Script(f) => f.access, } } /// Get a reference to a native Rust function. diff --git a/src/packages/string_more.rs b/src/packages/string_more.rs index 62a5fdb2..6ddc2b0c 100644 --- a/src/packages/string_more.rs +++ b/src/packages/string_more.rs @@ -15,7 +15,7 @@ use crate::stdlib::{ any::TypeId, boxed::Box, fmt::Display, - format, + format, mem, string::{String, ToString}, vec::Vec, }; @@ -242,7 +242,7 @@ def_package!(crate:MoreStringPackage:"Additional string utilities, including str } if len > 0 { - let ch = *args[2].downcast_ref::< char>().unwrap(); + let ch = mem::take(args[2]).cast::(); let s = args[0].downcast_mut::().unwrap(); let orig_len = s.chars().count(); diff --git a/src/token.rs b/src/token.rs index b8576133..77187304 100644 --- a/src/token.rs +++ b/src/token.rs @@ -30,8 +30,11 @@ pub type TokenStream<'a, 't> = Peekable>; /// A location (line number + character position) in the input script. /// -/// In order to keep footprint small, both line number and character position have 16-bit unsigned resolution, -/// meaning they go up to a maximum of 65,535 lines and characters per line. +/// # Limitations +/// +/// In order to keep footprint small, both line number and character position have 16-bit resolution, +/// meaning they go up to a maximum of 65,535 lines and 65,535 characters per line. 
+/// /// Advancing beyond the maximum line length or maximum number of lines is not an error but has no effect. #[derive(Eq, PartialEq, Ord, PartialOrd, Hash, Clone, Copy)] pub struct Position { @@ -43,6 +46,13 @@ pub struct Position { impl Position { /// Create a new `Position`. + /// + /// `line` must not be zero. + /// If `position` is zero, then it is at the beginning of a line. + /// + /// # Panics + /// + /// Panics if `line` is zero. pub fn new(line: u16, position: u16) -> Self { assert!(line != 0, "line cannot be zero"); @@ -52,7 +62,7 @@ impl Position { } } - /// Get the line number (1-based), or `None` if no position. + /// Get the line number (1-based), or `None` if there is no position. pub fn line(&self) -> Option { if self.is_none() { None @@ -85,7 +95,6 @@ impl Position { /// # Panics /// /// Panics if already at beginning of a line - cannot rewind to a previous line. - /// pub(crate) fn rewind(&mut self) { assert!(!self.is_none(), "cannot rewind Position::none"); assert!(self.pos > 0, "cannot rewind at position 0"); @@ -104,7 +113,7 @@ impl Position { } /// Create a `Position` representing no position. - pub(crate) fn none() -> Self { + pub fn none() -> Self { Self { line: 0, pos: 0 } } @@ -146,9 +155,9 @@ impl fmt::Debug for Position { pub enum Token { /// An `INT` constant. IntegerConstant(INT), - /// A `FLOAT` constaint. + /// A `FLOAT` constant. /// - /// Never appears under the `no_float` feature. + /// Reserved under the `no_float` feature. #[cfg(not(feature = "no_float"))] FloatConstant(FLOAT), /// An identifier. @@ -249,7 +258,7 @@ pub enum Token { And, /// `fn` /// - /// Never appears under the `no_function` feature. + /// Reserved under the `no_function` feature. #[cfg(not(feature = "no_function"))] Fn, /// `continue` @@ -284,22 +293,22 @@ pub enum Token { PowerOfAssign, /// `private` /// - /// Never appears under the `no_function` feature. + /// Reserved under the `no_function` feature. 
#[cfg(not(feature = "no_function"))] Private, /// `import` /// - /// Never appears under the `no_module` feature. + /// Reserved under the `no_module` feature. #[cfg(not(feature = "no_module"))] Import, /// `export` /// - /// Never appears under the `no_module` feature. + /// Reserved under the `no_module` feature. #[cfg(not(feature = "no_module"))] Export, /// `as` /// - /// Never appears under the `no_module` feature. + /// Reserved under the `no_module` feature. #[cfg(not(feature = "no_module"))] As, /// A lexer error. @@ -643,7 +652,7 @@ impl Token { } } - /// Is this token a standard keyword? + /// Is this token an active standard keyword? pub fn is_keyword(&self) -> bool { use Token::*; @@ -670,7 +679,7 @@ impl Token { } /// Convert a token into a function name, if possible. - pub fn into_function_name(self) -> Result { + pub(crate) fn into_function_name(self) -> Result { match self { Self::Reserved(s) if is_keyword_function(&s) => Ok(s), Self::Custom(s) | Self::Identifier(s) if is_valid_identifier(s.chars()) => Ok(s), @@ -726,32 +735,6 @@ pub trait InputStream { fn peek_next(&mut self) -> Option; } -pub fn is_keyword_function(name: &str) -> bool { - name == KEYWORD_PRINT - || name == KEYWORD_DEBUG - || name == KEYWORD_TYPE_OF - || name == KEYWORD_EVAL - || name == KEYWORD_FN_PTR - || name == KEYWORD_FN_PTR_CALL - || name == KEYWORD_FN_PTR_CURRY -} - -pub fn is_valid_identifier(name: impl Iterator) -> bool { - let mut first_alphabetic = false; - - for ch in name { - match ch { - '_' => (), - _ if char::is_ascii_alphabetic(&ch) => first_alphabetic = true, - _ if !first_alphabetic => return false, - _ if char::is_ascii_alphanumeric(&ch) => (), - _ => return false, - } - } - - first_alphabetic -} - /// [INTERNALS] Parse a string literal wrapped by `enclosing_char`. /// Exported under the `internals` feature only. /// @@ -1098,35 +1081,7 @@ fn get_next_token_inner( // letter or underscore ... 
('A'..='Z', _) | ('a'..='z', _) | ('_', _) => { - let mut result = Vec::new(); - result.push(c); - - while let Some(next_char) = stream.peek_next() { - match next_char { - x if x.is_ascii_alphanumeric() || x == '_' => { - result.push(x); - eat_next(stream, pos); - } - _ => break, - } - } - - let is_valid_identifier = is_valid_identifier(result.iter().cloned()); - - let identifier: String = result.into_iter().collect(); - - if !is_valid_identifier { - return Some(( - Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))), - start_pos, - )); - } - - return Some(( - Token::lookup_from_syntax(&identifier) - .unwrap_or_else(|| Token::Identifier(identifier)), - start_pos, - )); + return get_identifier(stream, pos, start_pos, c); } // " - string literal @@ -1149,7 +1104,7 @@ fn get_next_token_inner( |err| (Token::LexError(Box::new(err.0)), err.1), |result| { let mut chars = result.chars(); - let first = chars.next(); + let first = chars.next().unwrap(); if chars.next().is_some() { ( @@ -1157,10 +1112,7 @@ fn get_next_token_inner( start_pos, ) } else { - ( - Token::CharConstant(first.expect("should be Some")), - start_pos, - ) + (Token::CharConstant(first), start_pos) } }, )) @@ -1404,6 +1356,10 @@ fn get_next_token_inner( ('\0', _) => unreachable!(), (ch, _) if ch.is_whitespace() => (), + #[cfg(feature = "unicode-xid-ident")] + (ch, _) if unicode_xid::UnicodeXID::is_xid_start(ch) => { + return get_identifier(stream, pos, start_pos, c); + } (ch, _) => { return Some(( Token::LexError(Box::new(LERR::UnexpectedInput(ch.to_string()))), @@ -1422,6 +1378,95 @@ fn get_next_token_inner( } } +/// Get the next identifier. 
+fn get_identifier(
+    stream: &mut impl InputStream,
+    pos: &mut Position,
+    start_pos: Position,
+    first_char: char,
+) -> Option<(Token, Position)> {
+    let mut result = Vec::new();
+    result.push(first_char);
+
+    while let Some(next_char) = stream.peek_next() {
+        match next_char {
+            x if is_id_continue(x) => {
+                result.push(x);
+                eat_next(stream, pos);
+            }
+            _ => break,
+        }
+    }
+
+    let is_valid_identifier = is_valid_identifier(result.iter().cloned());
+
+    let identifier: String = result.into_iter().collect();
+
+    if !is_valid_identifier {
+        return Some((
+            Token::LexError(Box::new(LERR::MalformedIdentifier(identifier))),
+            start_pos,
+        ));
+    }
+
+    return Some((
+        Token::lookup_from_syntax(&identifier).unwrap_or_else(|| Token::Identifier(identifier)),
+        start_pos,
+    ));
+}
+
+/// Is this keyword allowed as a function?
+#[inline(always)]
+pub fn is_keyword_function(name: &str) -> bool {
+    name == KEYWORD_PRINT
+        || name == KEYWORD_DEBUG
+        || name == KEYWORD_TYPE_OF
+        || name == KEYWORD_EVAL
+        || name == KEYWORD_FN_PTR
+        || name == KEYWORD_FN_PTR_CALL
+        || name == KEYWORD_FN_PTR_CURRY
+}
+
+pub fn is_valid_identifier(name: impl Iterator<Item = char>) -> bool {
+    let mut first_alphabetic = false;
+
+    for ch in name {
+        match ch {
+            '_' => (),
+            _ if is_id_first_alphabetic(ch) => first_alphabetic = true,
+            _ if !first_alphabetic => return false,
+            _ if is_id_continue(ch) => (), // same continuation rule `get_identifier` collects with
+            _ => return false,
+        }
+    }
+
+    first_alphabetic
+}
+
+#[cfg(feature = "unicode-xid-ident")]
+#[inline(always)]
+fn is_id_first_alphabetic(x: char) -> bool {
+    unicode_xid::UnicodeXID::is_xid_start(x)
+}
+
+#[cfg(feature = "unicode-xid-ident")]
+#[inline(always)]
+fn is_id_continue(x: char) -> bool {
+    unicode_xid::UnicodeXID::is_xid_continue(x)
+}
+
+#[cfg(not(feature = "unicode-xid-ident"))]
+#[inline(always)]
+fn is_id_first_alphabetic(x: char) -> bool {
+    x.is_ascii_alphabetic()
+}
+
+#[cfg(not(feature = "unicode-xid-ident"))]
+#[inline(always)]
+fn is_id_continue(x: char) -> bool {
+    x.is_ascii_alphanumeric() || x == '_'
+}
+
 /// A type that implements the `InputStream` trait.
 /// Multiple character streams are jointed together to form one single stream.
 pub struct MultiInputsStream<'a> {
diff --git a/tests/tokens.rs b/tests/tokens.rs
index 523beab7..e393663d 100644
--- a/tests/tokens.rs
+++ b/tests/tokens.rs
@@ -51,3 +51,29 @@ fn test_tokens_custom_operator() -> Result<(), Box<EvalAltResult>> {
 
     Ok(())
 }
+
+#[test]
+fn test_tokens_unicode_xid_ident() -> Result<(), Box<EvalAltResult>> {
+    let engine = Engine::new();
+    let result = engine.eval::<INT>(
+        r"
+            fn すべての答え() { 42 }
+            すべての答え()
+        ",
+    );
+    #[cfg(feature = "unicode-xid-ident")]
+    assert_eq!(result?, 42);
+
+    #[cfg(not(feature = "unicode-xid-ident"))]
+    assert!(result.is_err());
+
+    let result = engine.eval::<INT>(
+        r"
+            fn _1() { 1 }
+            _1()
+        ",
+    );
+    assert!(result.is_err());
+
+    Ok(())
+}