add phone_number_mappings
- **important** Uppercase in helper_functions::normalize_helper
This commit is contained in:
@@ -4,3 +4,4 @@ mod interfaces;
|
||||
mod proto_gen;
|
||||
mod phonenumberutil;
|
||||
mod regexp_cache;
|
||||
mod regex_based_matcher;
|
||||
@@ -261,7 +261,7 @@ pub(super) fn normalize_helper(
|
||||
let mut normalized_number = String::with_capacity(phone_number.len());
|
||||
// Skip UTF checking because strings in rust are valid UTF-8 already
|
||||
for phone_char in phone_number.chars() {
|
||||
if let Some(replacement) = normalization_replacements.get(&phone_char) {
|
||||
if let Some(replacement) = normalization_replacements.get(&phone_char.to_ascii_uppercase()) {
|
||||
normalized_number.push(*replacement);
|
||||
} else if !remove_non_matches {
|
||||
normalized_number.push(phone_char);
|
||||
|
||||
@@ -3,6 +3,7 @@ pub mod helper_functions;
|
||||
mod enums;
|
||||
mod phonenumberutil;
|
||||
mod regex_and_mappings;
|
||||
mod phone_number_regexps_and_mappings;
|
||||
|
||||
pub use enums::{MatchType, PhoneNumberFormat, PhoneNumberType, ValidationResultErr, ValidNumberLenType};
|
||||
use thiserror::Error;
|
||||
|
||||
310
src/phonenumberutil/phone_number_regexps_and_mappings.rs
Normal file
310
src/phonenumberutil/phone_number_regexps_and_mappings.rs
Normal file
@@ -0,0 +1,310 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
use crate::{phonenumberutil::{helper_constants::{self, CAPTURE_UP_TO_SECOND_NUMBER_START, DIGITS, MIN_LENGTH_FOR_NSN, PLUS_CHARS, PLUS_SIGN, RFC3966_VISUAL_SEPARATOR, STAR_SIGN, VALID_ALPHA, VALID_ALPHA_INCL_UPPERCASE, VALID_PUNCTUATION}, helper_functions::create_extn_pattern}, regexp_cache::RegexCache};
|
||||
|
||||
pub(super) struct PhoneNumberRegExpsAndMappings {
|
||||
/// Regular expression of viable phone numbers. This is location independent.
|
||||
/// Checks we have at least three leading digits, and only valid punctuation,
|
||||
/// alpha characters and digits in the phone number. Does not include extension
|
||||
/// data. The symbol 'x' is allowed here as valid punctuation since it is often
|
||||
/// used as a placeholder for carrier codes, for example in Brazilian phone
|
||||
/// numbers. We also allow multiple plus-signs at the start.
|
||||
///
|
||||
/// Corresponds to the following:
|
||||
/// `[digits]{minLengthNsn}|
|
||||
/// plus_sign*(([punctuation]|[star])*[digits]){3,}
|
||||
/// ([punctuation]|[star]|[digits]|[alpha])*`
|
||||
///
|
||||
/// The first reg-ex is to allow short numbers (two digits long) to be parsed
|
||||
/// if they are entered as "15" etc, but only if there is no punctuation in
|
||||
/// them. The second expression restricts the number of digits to three or
|
||||
/// more, but then allows them to be in international form, and to have
|
||||
/// alpha-characters and punctuation.
|
||||
valid_phone_number: String,
|
||||
|
||||
/// Regexp of all possible ways to write extensions, for use when parsing. This
|
||||
/// will be run as a case-insensitive regexp match. Wide character versions are
|
||||
/// also provided after each ASCII version.
|
||||
/// For parsing, we are slightly more lenient in our interpretation than for
|
||||
/// matching. Here we allow "comma" and "semicolon" as possible extension
|
||||
/// indicators. When matching, these are hardly ever used to indicate this.
|
||||
extn_patterns_for_parsing: String,
|
||||
|
||||
/// Regular expressions of different parts of the phone-context parameter,
|
||||
/// following the syntax defined in RFC3966.
|
||||
rfc3966_phone_digit: String,
|
||||
alphanum: String,
|
||||
rfc3966_domainlabel: String,
|
||||
rfc3966_toplabel: String,
|
||||
|
||||
pub regexp_cache: RegexCache,
|
||||
|
||||
/// A map that contains characters that are essential when dialling. That means
|
||||
/// any of the characters in this map must not be removed from a number when
|
||||
/// dialing, otherwise the call will not reach the intended destination.
|
||||
pub diallable_char_mappings: HashMap<char, char>,
|
||||
/// These mappings map a character (key) to a specific digit that should
|
||||
/// replace it for normalization purposes.
|
||||
pub alpha_mappings: HashMap<char, char>,
|
||||
/// For performance reasons, store a map of combining alpha_mappings with ASCII
|
||||
/// digits.
|
||||
pub alpha_phone_mappings: HashMap<char, char>,
|
||||
|
||||
/// Separate map of all symbols that we wish to retain when formatting alpha
|
||||
/// numbers. This includes digits, ascii letters and number grouping symbols
|
||||
/// such as "-" and " ".
|
||||
pub all_plus_number_grouping_symbols: HashMap<char, char>,
|
||||
|
||||
/// Map of country calling codes that use a mobile token before the area code.
|
||||
/// One example of when this is relevant is when determining the length of the
|
||||
/// national destination code, which should be the length of the area code plus
|
||||
/// the length of the mobile token.
|
||||
pub mobile_token_mappings: HashMap<i32, char>,
|
||||
|
||||
/// Set of country codes that doesn't have national prefix, but it has area
|
||||
/// codes.
|
||||
pub countries_without_national_prefix_with_area_codes: HashSet<i32>,
|
||||
|
||||
/// Set of country codes that have geographically assigned mobile numbers (see
|
||||
/// geo_mobile_countries_ below) which are not based on *area codes*. For
|
||||
/// example, in China mobile numbers start with a carrier indicator, and beyond
|
||||
/// that are geographically assigned: this carrier indicator is not considered
|
||||
/// to be an area code.
|
||||
pub geo_mobile_countries_without_mobile_area_codes: HashSet<i32>,
|
||||
|
||||
/// Set of country calling codes that have geographically assigned mobile
|
||||
/// numbers. This may not be complete; we add calling codes case by case, as we
|
||||
/// find geographical mobile numbers or hear from user reports.
|
||||
pub geo_mobile_countries: HashSet<i32>,
|
||||
|
||||
/// Pattern that makes it easy to distinguish whether a region has a single
|
||||
/// international dialing prefix or not. If a region has a single international
|
||||
/// prefix (e.g. 011 in USA), it will be represented as a string that contains
|
||||
/// a sequence of ASCII digits, and possibly a tilde, which signals waiting for
|
||||
/// the tone. If there are multiple available international prefixes in a
|
||||
/// region, they will be represented as a regex string that always contains one
|
||||
/// or more characters that are not ASCII digits or a tilde.
|
||||
pub single_international_prefix: Regex,
|
||||
|
||||
pub digits_pattern: Regex,
|
||||
pub capturing_digit_pattern: Regex,
|
||||
pub capturing_ascii_digits_pattern: Regex,
|
||||
|
||||
/// Regular expression of acceptable characters that may start a phone number
|
||||
/// for the purposes of parsing. This allows us to strip away meaningless
|
||||
/// prefixes to phone numbers that may be mistakenly given to us. This consists
|
||||
/// of digits, the plus symbol and arabic-indic digits. This does not contain
|
||||
/// alpha characters, although they may be used later in the number. It also
|
||||
/// does not include other punctuation, as this will be stripped later during
|
||||
/// parsing and is of no information value when parsing a number. The string
|
||||
/// starting with this valid character is captured.
|
||||
/// This corresponds to VALID_START_CHAR in the java version.
|
||||
pub valid_start_char_pattern: Regex,
|
||||
|
||||
/// Regular expression of valid characters before a marker that might indicate
|
||||
/// a second number.
|
||||
pub capture_up_to_second_number_start_pattern: Regex,
|
||||
|
||||
/// Regular expression of trailing characters that we want to remove. We remove
|
||||
/// all characters that are not alpha or numerical characters. The hash
|
||||
/// character is retained here, as it may signify the previous block was an
|
||||
/// extension. Note the capturing block at the start to capture the rest of the
|
||||
/// number if this was a match.
|
||||
/// This corresponds to UNWANTED_END_CHAR_PATTERN in the java version.
|
||||
pub unwanted_end_char_pattern: Regex,
|
||||
|
||||
/// Regular expression of groups of valid punctuation characters.
|
||||
pub separator_pattern: Regex,
|
||||
|
||||
/// Regexp of all possible ways to write extensions, for use when finding phone
|
||||
/// numbers in text. This will be run as a case-insensitive regexp match. Wide
|
||||
/// character versions are also provided after each ASCII version.
|
||||
pub extn_patterns_for_matching: String,
|
||||
|
||||
/// Regexp of all known extension prefixes used by different regions followed
|
||||
/// by 1 or more valid digits, for use when parsing.
|
||||
pub extn_pattern: Regex,
|
||||
|
||||
/// We append optionally the extension pattern to the end here, as a valid
|
||||
/// phone number may have an extension prefix appended, followed by 1 or more
|
||||
/// digits.
|
||||
pub valid_phone_number_pattern: Regex,
|
||||
|
||||
/// We use this pattern to check if the phone number has at least three letters
|
||||
/// in it - if so, then we treat it as a number where some phone-number digits
|
||||
/// are represented by letters.
|
||||
pub valid_alpha_phone_pattern: Regex,
|
||||
|
||||
pub first_group_capturing_pattern: Regex,
|
||||
|
||||
pub carrier_code_pattern: Regex,
|
||||
|
||||
pub plus_chars_pattern: Regex,
|
||||
|
||||
/// Regular expression of valid global-number-digits for the phone-context
|
||||
/// parameter, following the syntax defined in RFC3966.
|
||||
pub rfc3966_global_number_digits_pattern: Regex,
|
||||
|
||||
/// Regular expression of valid domainname for the phone-context parameter,
|
||||
/// following the syntax defined in RFC3966.
|
||||
pub rfc3966_domainname_pattern: Regex,
|
||||
}
|
||||
|
||||
impl PhoneNumberRegExpsAndMappings {
|
||||
fn initialize_regexp_mappings(&mut self) {
|
||||
self.mobile_token_mappings.insert(54, '9');
|
||||
|
||||
self.geo_mobile_countries_without_mobile_area_codes.insert(86); // China
|
||||
|
||||
self.countries_without_national_prefix_with_area_codes.insert(52); // Mexico
|
||||
|
||||
self.geo_mobile_countries.insert(52); // Mexico
|
||||
self.geo_mobile_countries.insert(54); // Argentina
|
||||
self.geo_mobile_countries.insert(55); // Brazil
|
||||
self.geo_mobile_countries.insert(62); // Indonesia: some prefixes only (fixed CMDA wireless)
|
||||
self.geo_mobile_countries.extend(&self.geo_mobile_countries_without_mobile_area_codes);
|
||||
|
||||
// Simple ASCII digits map used to populate ALPHA_PHONE_MAPPINGS and
|
||||
// ALL_PLUS_NUMBER_GROUPING_SYMBOLS.
|
||||
let mut ascii_digit_mappings = HashMap::with_capacity(10);
|
||||
for d in '0'..'9' {
|
||||
ascii_digit_mappings.insert(d, d);
|
||||
}
|
||||
|
||||
let mut alpha_map = HashMap::with_capacity(40);
|
||||
alpha_map.insert('A', '2');
|
||||
alpha_map.insert('B', '2');
|
||||
alpha_map.insert('C', '2');
|
||||
alpha_map.insert('D', '3');
|
||||
alpha_map.insert('E', '3');
|
||||
alpha_map.insert('F', '3');
|
||||
alpha_map.insert('G', '4');
|
||||
alpha_map.insert('H', '4');
|
||||
alpha_map.insert('I', '4');
|
||||
alpha_map.insert('J', '5');
|
||||
alpha_map.insert('K', '5');
|
||||
alpha_map.insert('L', '5');
|
||||
alpha_map.insert('M', '6');
|
||||
alpha_map.insert('N', '6');
|
||||
alpha_map.insert('O', '6');
|
||||
alpha_map.insert('P', '7');
|
||||
alpha_map.insert('Q', '7');
|
||||
alpha_map.insert('R', '7');
|
||||
alpha_map.insert('S', '7');
|
||||
alpha_map.insert('T', '8');
|
||||
alpha_map.insert('U', '8');
|
||||
alpha_map.insert('V', '8');
|
||||
alpha_map.insert('W', '9');
|
||||
alpha_map.insert('X', '9');
|
||||
alpha_map.insert('Y', '9');
|
||||
alpha_map.insert('Z', '9');
|
||||
self.alpha_mappings = alpha_map;
|
||||
|
||||
let mut combined_map = HashMap::with_capacity(100);
|
||||
combined_map.extend(self.alpha_mappings.iter());
|
||||
combined_map.extend(ascii_digit_mappings.iter());
|
||||
self.alpha_phone_mappings = combined_map;
|
||||
|
||||
let mut dilatable_char_map = HashMap::new();
|
||||
dilatable_char_map.extend(ascii_digit_mappings.iter());
|
||||
dilatable_char_map.insert('+', '+');
|
||||
dilatable_char_map.insert('*', '*');
|
||||
dilatable_char_map.insert('#', '#');
|
||||
self.diallable_char_mappings = dilatable_char_map;
|
||||
|
||||
let mut all_plus_number_groupings = HashMap::new();
|
||||
// insert (lower letter -> upper letter) and (upper letter -> upper letter) mappings.
|
||||
for c in self.alpha_mappings.keys() {
|
||||
all_plus_number_groupings.insert(c.to_ascii_lowercase(), *c);
|
||||
all_plus_number_groupings.insert(*c, *c);
|
||||
}
|
||||
all_plus_number_groupings.extend(ascii_digit_mappings.iter());
|
||||
// insert grouping symbols.
|
||||
all_plus_number_groupings.insert('-', '-');
|
||||
all_plus_number_groupings.insert('\u{FF0D}', '-');
|
||||
all_plus_number_groupings.insert('\u{2010}', '-');
|
||||
all_plus_number_groupings.insert('\u{2011}', '-');
|
||||
all_plus_number_groupings.insert('\u{2012}', '-');
|
||||
all_plus_number_groupings.insert('\u{2013}', '-');
|
||||
all_plus_number_groupings.insert('\u{2014}', '-');
|
||||
all_plus_number_groupings.insert('\u{2015}', '-');
|
||||
all_plus_number_groupings.insert('\u{2212}', '-');
|
||||
all_plus_number_groupings.insert('/', '/');
|
||||
all_plus_number_groupings.insert('\u{FF0F}', '/');
|
||||
all_plus_number_groupings.insert(' ', ' ');
|
||||
all_plus_number_groupings.insert('\u{3000}', ' ');
|
||||
all_plus_number_groupings.insert('\u{2060}', ' ');
|
||||
all_plus_number_groupings.insert('.', '.');
|
||||
all_plus_number_groupings.insert('\u{FF0E}', '.');
|
||||
self.all_plus_number_grouping_symbols = all_plus_number_groupings;
|
||||
}
|
||||
|
||||
fn new() -> Self {
|
||||
let alphanum = fast_cat::concat_str!(VALID_ALPHA_INCL_UPPERCASE, DIGITS);
|
||||
let extn_patterns_for_parsing = create_extn_pattern(true);
|
||||
let valid_phone_number = format!(
|
||||
"{}{{{}}}|[{}]*(?:[{}{}]*{}){{3,}}[{}{}{}{}]*",
|
||||
DIGITS, MIN_LENGTH_FOR_NSN, PLUS_CHARS,
|
||||
VALID_PUNCTUATION, STAR_SIGN, DIGITS,
|
||||
VALID_PUNCTUATION, STAR_SIGN, VALID_ALPHA, DIGITS
|
||||
);
|
||||
|
||||
let rfc3966_phone_digit = format!("({}|{})", DIGITS, RFC3966_VISUAL_SEPARATOR);
|
||||
let rfc3966_domainlabel = format!("[{}]+((\\-)*[{}])*", alphanum, alphanum);
|
||||
let rfc3966_toplabel = format!("[{}]+((\\-)*[{}])*", VALID_ALPHA_INCL_UPPERCASE, alphanum);
|
||||
|
||||
let mut instance = Self{
|
||||
// it'll be initialized only once, so we can use slow format!
|
||||
valid_phone_number: valid_phone_number.clone(),
|
||||
extn_patterns_for_parsing: extn_patterns_for_parsing.clone(),
|
||||
rfc3966_phone_digit: rfc3966_phone_digit.clone(),
|
||||
alphanum: alphanum,
|
||||
rfc3966_domainlabel: rfc3966_domainlabel.clone(),
|
||||
rfc3966_toplabel: rfc3966_toplabel.clone(),
|
||||
regexp_cache: RegexCache::with_capacity(128),
|
||||
diallable_char_mappings: Default::default(),
|
||||
alpha_mappings: Default::default(),
|
||||
alpha_phone_mappings: Default::default(),
|
||||
all_plus_number_grouping_symbols: Default::default(),
|
||||
mobile_token_mappings: Default::default(),
|
||||
countries_without_national_prefix_with_area_codes: Default::default(),
|
||||
geo_mobile_countries: Default::default(),
|
||||
geo_mobile_countries_without_mobile_area_codes: Default::default(),
|
||||
single_international_prefix: Regex::new("[\\d]+(?:[~\u{2053}\u{223C}\u{FF5E}][\\d]+)?").unwrap(),
|
||||
digits_pattern: Regex::new(&format!("[{}]*", DIGITS)).unwrap(),
|
||||
capturing_digit_pattern: Regex::new(&format!("([{}])", DIGITS)).unwrap(),
|
||||
capturing_ascii_digits_pattern: Regex::new("(\\d+)").unwrap(),
|
||||
valid_start_char_pattern: Regex::new(&format!("[{}{}]", PLUS_CHARS, DIGITS)).unwrap(),
|
||||
capture_up_to_second_number_start_pattern: Regex::new(CAPTURE_UP_TO_SECOND_NUMBER_START).unwrap(),
|
||||
unwanted_end_char_pattern: Regex::new("[^\\p{N}\\p{L}#]").unwrap(),
|
||||
separator_pattern: Regex::new(&format!("[{}]+", VALID_PUNCTUATION)).unwrap(),
|
||||
extn_patterns_for_matching: create_extn_pattern(false),
|
||||
extn_pattern: Regex::new(&format!("(?i)(?:{})$", &extn_patterns_for_parsing)).unwrap(),
|
||||
valid_phone_number_pattern: Regex::new(&format!("(?i){}(?:{})?",
|
||||
&valid_phone_number,
|
||||
extn_patterns_for_parsing
|
||||
)).unwrap(),
|
||||
valid_alpha_phone_pattern: Regex::new(&format!("(?i)(?:.*?[{}]){{3}}",
|
||||
VALID_ALPHA
|
||||
)).unwrap(),
|
||||
// The first_group_capturing_pattern was originally set to $1 but there
|
||||
// are some countries for which the first group is not used in the
|
||||
// national pattern (e.g. Argentina) so the $1 group does not match
|
||||
// correctly. Therefore, we use \d, so that the first group actually
|
||||
// used in the pattern will be matched.
|
||||
first_group_capturing_pattern: Regex::new("(\\$\\d)").unwrap(),
|
||||
carrier_code_pattern: Regex::new("\\$CC").unwrap(),
|
||||
plus_chars_pattern: Regex::new(&format!("[{}]+", &PLUS_CHARS)).unwrap(),
|
||||
rfc3966_global_number_digits_pattern: Regex::new(
|
||||
&format!("^\\{}{}*{}{}*$", PLUS_SIGN, &rfc3966_phone_digit, DIGITS, rfc3966_phone_digit)
|
||||
).unwrap(),
|
||||
rfc3966_domainname_pattern: Regex::new(
|
||||
&format!("^({}\\.)*{}\\.?$", rfc3966_domainlabel, rfc3966_toplabel)
|
||||
).unwrap(),
|
||||
};
|
||||
instance.initialize_regexp_mappings();
|
||||
instance
|
||||
}
|
||||
}
|
||||
57
src/regex_based_matcher.rs
Normal file
57
src/regex_based_matcher.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use log::{error};
|
||||
|
||||
use crate::{interfaces, proto_gen::phonemetadata::PhoneNumberDesc, regexp_cache::{self, RegexCache}};
|
||||
|
||||
pub struct RegexBasedMatcher {
|
||||
cache: RegexCache,
|
||||
}
|
||||
|
||||
impl RegexBasedMatcher {
|
||||
pub fn new() -> Self {
|
||||
Self { cache: RegexCache::with_capacity(128) }
|
||||
}
|
||||
|
||||
fn match_number(
|
||||
&self, phone_number: &str,
|
||||
number_pattern: &str,
|
||||
allow_prefix_match: bool
|
||||
) -> Result<bool, regexp_cache::ErrorInvalidRegex> {
|
||||
let regexp = self.cache.get_regex(number_pattern)?;
|
||||
|
||||
// find first occurrence
|
||||
if let Some(mat) = regexp.find(phone_number) {
|
||||
// if first position is not matched none of scenarios are possible
|
||||
if mat.start() != 0 {
|
||||
return Ok(false);
|
||||
}
|
||||
// full match
|
||||
if mat.end() == phone_number.len() {
|
||||
return Ok(true);
|
||||
} else if allow_prefix_match {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
impl interfaces::MatcherApi for RegexBasedMatcher {
|
||||
fn match_national_number(
|
||||
&self, number: &str,
|
||||
number_desc: &PhoneNumberDesc,
|
||||
allow_prefix_match: bool
|
||||
) -> bool {
|
||||
let national_number_pattern = number_desc.national_number_pattern();
|
||||
// We don't want to consider it a prefix match when matching non-empty input
|
||||
// against an empty pattern.
|
||||
if national_number_pattern.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if let Ok(res) = self.match_number(number, national_number_pattern, allow_prefix_match) {
|
||||
res
|
||||
} else {
|
||||
error!("Invalid regex! {}", national_number_pattern);
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user