From 929fdbae8ab03e118cda9bce78775fb023208ece Mon Sep 17 00:00:00 2001 From: Vlasislav Kashin <99754299+vloldik@users.noreply.github.com> Date: Sun, 29 Jun 2025 22:41:51 +0300 Subject: [PATCH] add phone_number_mappungs - **important** Uppercase in helper_functions::normalize_helper --- Cargo.lock | 495 +----------------- Cargo.toml | 2 +- src/lib.rs | 1 + src/phonenumberutil/helper_functions.rs | 2 +- src/phonenumberutil/mod.rs | 1 + .../phone_number_regexps_and_mappings.rs | 310 +++++++++++ src/regex_based_matcher.rs | 57 ++ 7 files changed, 379 insertions(+), 489 deletions(-) create mode 100644 src/phonenumberutil/phone_number_regexps_and_mappings.rs create mode 100644 src/regex_based_matcher.rs diff --git a/Cargo.lock b/Cargo.lock index 5324b4e..fc3efc9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,64 +17,24 @@ version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" -[[package]] -name = "autocfg" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dde43e75fd43e8a1bf86103336bc699aa8d17ad1be60c76c0bdfd4828e19b78" -dependencies = [ - "autocfg 1.5.0", -] - [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "base64" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "489d6c0ed21b11d038c31b6ceccca973e65d73ba3bd8ecb9a2babf5546164643" -dependencies = [ - "byteorder", - "safemem", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "cfg-if" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -129,12 +89,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" - [[package]] name = "getrandom" version = "0.3.3" @@ -144,7 +98,7 @@ dependencies = [ "cfg-if", "libc", "r-efi", - "wasi 0.14.2+wasi-0.2.4", + "wasi", ] [[package]] @@ -165,12 +119,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" - [[package]] name = "home" version = "0.5.11" @@ -180,42 +128,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "httparse" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" - -[[package]] 
-name = "hyper" -version = "0.10.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a0652d9a2609a968c14be1a9ea00bf4b1d64e2e1f53a1b51b6fff3a6e829273" -dependencies = [ - "base64", - "httparse", - "language-tags", - "log 0.3.9", - "mime", - "num_cpus", - "time", - "traitobject", - "typeable", - "unicase", - "url", -] - -[[package]] -name = "idna" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f09e0f0b1fb55fdee1f17470ad800da77af5186a1a76c026b679358b7e844e" -dependencies = [ - "matches", - "unicode-bidi", - "unicode-normalization", -] - [[package]] name = "indexmap" version = "2.10.0" @@ -226,34 +138,12 @@ dependencies = [ "hashbrown 0.15.4", ] -[[package]] -name = "iron" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6d308ca2d884650a8bf9ed2ff4cb13fbb2207b71f64cda11dc9b892067295e8" -dependencies = [ - "hyper", - "log 0.3.9", - "mime_guess", - "modifier", - "num_cpus", - "plugin", - "typemap", - "url", -] - [[package]] name = "itoa" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" -[[package]] -name = "language-tags" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a" - [[package]] name = "libc" version = "0.2.174" @@ -278,85 +168,22 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" dependencies = [ - "autocfg 1.5.0", + "autocfg", "scopeguard", ] -[[package]] -name = "log" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" -dependencies = [ - "log 0.4.27", -] - [[package]] name = "log" 
version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" -[[package]] -name = "logger" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c9172cb4c2f6c52117e25570983edcbb322f130b1031ae5d5d6b1abe7eeb493" -dependencies = [ - "iron", - "log 0.3.9", - "time", -] - -[[package]] -name = "matches" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" - [[package]] name = "memchr" version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" -[[package]] -name = "mime" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba626b8a6de5da682e1caa06bdb42a335aee5a84db8e5046a3e8ab17ba0a3ae0" -dependencies = [ - "log 0.3.9", -] - -[[package]] -name = "mime_guess" -version = "1.8.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "216929a5ee4dd316b1702eedf5e74548c123d370f47841ceaac38ca154690ca3" -dependencies = [ - "mime", - "phf", - "phf_codegen", - "unicase", -] - -[[package]] -name = "modifier" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f5c9112cb662acd3b204077e0de5bc66305fa8df65c8019d5adb10e9ab6e58" - -[[package]] -name = "num_cpus" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "once_cell" version = "1.21.3" @@ -376,60 +203,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "percent-encoding" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"31010dd2e1ac33d5b46a5b413495239882813e0369f8ed8a5e266f173602f831" - -[[package]] -name = "phf" -version = "0.7.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_codegen" -version = "0.7.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.7.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" -dependencies = [ - "phf_shared", - "rand", -] - -[[package]] -name = "phf_shared" -version = "0.7.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0" -dependencies = [ - "siphasher", - "unicase", -] - -[[package]] -name = "plugin" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a6a0dc3910bc8db877ffed8e457763b317cf880df4ae19109b9f77d277cf6e0" -dependencies = [ - "typemap", -] - [[package]] name = "proc-macro2" version = "1.0.95" @@ -473,7 +246,7 @@ checksum = "b4aeaa1f2460f1d348eeaeed86aea999ce98c1bded6f089ff8514c9d9dbdc973" dependencies = [ "anyhow", "indexmap", - "log 0.4.27", + "log", "protobuf", "protobuf-support", "tempfile", @@ -505,128 +278,13 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" -[[package]] -name = "rand" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" -dependencies = [ - "autocfg 0.1.8", - "libc", - "rand_chacha", - "rand_core 
0.4.2", - "rand_hc", - "rand_isaac", - "rand_jitter", - "rand_os", - "rand_pcg", - "rand_xorshift", - "winapi", -] - -[[package]] -name = "rand_chacha" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" -dependencies = [ - "autocfg 0.1.8", - "rand_core 0.3.1", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -dependencies = [ - "rand_core 0.4.2", -] - -[[package]] -name = "rand_core" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" - -[[package]] -name = "rand_hc" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_isaac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_jitter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" -dependencies = [ - "libc", - "rand_core 0.4.2", - "winapi", -] - -[[package]] -name = "rand_os" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" -dependencies = [ - "cloudabi", - "fuchsia-cprng", - "libc", - "rand_core 0.4.2", - "rdrand", - "winapi", -] - -[[package]] -name = "rand_pcg" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" -dependencies = [ - "autocfg 0.1.8", - "rand_core 0.4.2", -] - -[[package]] -name = "rand_xorshift" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "redox_syscall" version = "0.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" dependencies = [ - "bitflags 2.9.1", + "bitflags", ] [[package]] @@ -665,7 +323,7 @@ dependencies = [ "dashmap", "fast-cat", "itoa", - "logger", + "log", "protobuf", "protobuf-codegen", "regex", @@ -679,7 +337,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.9.1", + "bitflags", "errno", "libc", "linux-raw-sys 0.4.15", @@ -692,7 +350,7 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ - "bitflags 2.9.1", + "bitflags", "errno", "libc", "linux-raw-sys 0.9.4", @@ -705,24 +363,12 @@ version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" -[[package]] -name = "safemem" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" - [[package]] name = "scopeguard" version = "1.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "siphasher" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" - [[package]] name = "smallvec" version = "1.15.1" @@ -815,115 +461,12 @@ dependencies = [ "syn", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "tinyvec" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - -[[package]] -name = "traitobject" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04a79e25382e2e852e8da874249358d382ebaf259d0d34e75d8db16a7efabbc7" - -[[package]] -name = "typeable" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1410f6f91f21d1612654e7cc69193b0334f909dcf2c790c4826254fbb86f8887" - -[[package]] -name = "typemap" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "653be63c80a3296da5551e1bfd2cca35227e13cdd08c6668903ae2f4f77aa1f6" -dependencies = [ - "unsafe-any", -] - -[[package]] -name = "unicase" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f4765f83163b74f957c797ad9253caf97f103fb064d3999aea9568d09fc8a33" -dependencies = [ - 
"version_check", -] - -[[package]] -name = "unicode-bidi" -version = "0.3.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" - [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" -[[package]] -name = "unicode-normalization" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "unsafe-any" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30360d7979f5e9c6e6cea48af192ea8fab4afb3cf72597154b8f08935bc9c7f" -dependencies = [ - "traitobject", -] - -[[package]] -name = "url" -version = "1.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd4e7c0d531266369519a4aa4f399d748bd37043b00bde1e4ff1f60a120b355a" -dependencies = [ - "idna", - "matches", - "percent-encoding", -] - -[[package]] -name = "version_check" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" - -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -945,28 +488,6 @@ dependencies = [ "rustix 0.38.44", ] -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] 
-name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" version = "0.59.0" @@ -1119,5 +640,5 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.9.1", + "bitflags", ] diff --git a/Cargo.toml b/Cargo.toml index 292e01c..201539a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ build = "build/rust_build.rs" [dependencies] # logging standard in rust -logger = "0.4.0" +log = "0.4.27" # helpful error package thiserror = "2.0.12" # google protobuf lib required to use .proto files from assets diff --git a/src/lib.rs b/src/lib.rs index a0a1233..9ee1559 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,3 +4,4 @@ mod interfaces; mod proto_gen; mod phonenumberutil; mod regexp_cache; +mod regex_based_matcher; \ No newline at end of file diff --git a/src/phonenumberutil/helper_functions.rs b/src/phonenumberutil/helper_functions.rs index c2760de..73fe6bc 100644 --- a/src/phonenumberutil/helper_functions.rs +++ b/src/phonenumberutil/helper_functions.rs @@ -261,7 +261,7 @@ pub(super) fn normalize_helper( let mut normalized_number = String::with_capacity(phone_number.len()); // Skip UTF checking because strings in rust are valid UTF-8 already for phone_char in phone_number.chars() { - if let Some(replacement) = normalization_replacements.get(&phone_char) { + if let Some(replacement) = normalization_replacements.get(&phone_char.to_ascii_uppercase()) { normalized_number.push(*replacement); } else if !remove_non_matches { 
normalized_number.push(phone_char); diff --git a/src/phonenumberutil/mod.rs b/src/phonenumberutil/mod.rs index 1b75421..478458d 100644 --- a/src/phonenumberutil/mod.rs +++ b/src/phonenumberutil/mod.rs @@ -3,6 +3,7 @@ pub mod helper_functions; mod enums; mod phonenumberutil; mod regex_and_mappings; +mod phone_number_regexps_and_mappings; pub use enums::{MatchType, PhoneNumberFormat, PhoneNumberType, ValidationResultErr, ValidNumberLenType}; use thiserror::Error; diff --git a/src/phonenumberutil/phone_number_regexps_and_mappings.rs b/src/phonenumberutil/phone_number_regexps_and_mappings.rs new file mode 100644 index 0000000..94307ad --- /dev/null +++ b/src/phonenumberutil/phone_number_regexps_and_mappings.rs @@ -0,0 +1,310 @@ +use std::collections::{HashMap, HashSet}; + +use regex::Regex; + +use crate::{phonenumberutil::{helper_constants::{self, CAPTURE_UP_TO_SECOND_NUMBER_START, DIGITS, MIN_LENGTH_FOR_NSN, PLUS_CHARS, PLUS_SIGN, RFC3966_VISUAL_SEPARATOR, STAR_SIGN, VALID_ALPHA, VALID_ALPHA_INCL_UPPERCASE, VALID_PUNCTUATION}, helper_functions::create_extn_pattern}, regexp_cache::RegexCache}; + +pub(super) struct PhoneNumberRegExpsAndMappings { + /// Regular expression of viable phone numbers. This is location independent. + /// Checks we have at least three leading digits, and only valid punctuation, + /// alpha characters and digits in the phone number. Does not include extension + /// data. The symbol 'x' is allowed here as valid punctuation since it is often + /// used as a placeholder for carrier codes, for example in Brazilian phone + /// numbers. We also allow multiple plus-signs at the start. + /// + /// Corresponds to the following: + /// `[digits]{minLengthNsn}| + /// plus_sign*(([punctuation]|[star])*[digits]){3,} + /// ([punctuation]|[star]|[digits]|[alpha])*` + /// + /// The first reg-ex is to allow short numbers (two digits long) to be parsed + /// if they are entered as "15" etc, but only if there is no punctuation in + /// them. 
The second expression restricts the number of digits to three or + /// more, but then allows them to be in international form, and to have + /// alpha-characters and punctuation. + valid_phone_number: String, + + /// Regexp of all possible ways to write extensions, for use when parsing. This + /// will be run as a case-insensitive regexp match. Wide character versions are + /// also provided after each ASCII version. + /// For parsing, we are slightly more lenient in our interpretation than for + /// matching. Here we allow "comma" and "semicolon" as possible extension + /// indicators. When matching, these are hardly ever used to indicate this. + extn_patterns_for_parsing: String, + + /// Regular expressions of different parts of the phone-context parameter, + /// following the syntax defined in RFC3966. + rfc3966_phone_digit: String, + alphanum: String, + rfc3966_domainlabel: String, + rfc3966_toplabel: String, + + pub regexp_cache: RegexCache, + + /// A map that contains characters that are essential when dialling. That means + /// any of the characters in this map must not be removed from a number when + /// dialing, otherwise the call will not reach the intended destination. + pub diallable_char_mappings: HashMap, + /// These mappings map a character (key) to a specific digit that should + /// replace it for normalization purposes. + pub alpha_mappings: HashMap, + /// For performance reasons, store a map of combining alpha_mappings with ASCII + /// digits. + pub alpha_phone_mappings: HashMap, + + /// Separate map of all symbols that we wish to retain when formatting alpha + /// numbers. This includes digits, ascii letters and number grouping symbols + /// such as "-" and " ". + pub all_plus_number_grouping_symbols: HashMap, + + /// Map of country calling codes that use a mobile token before the area code. 
+ /// One example of when this is relevant is when determining the length of the + /// national destination code, which should be the length of the area code plus + /// the length of the mobile token. + pub mobile_token_mappings: HashMap, + + /// Set of country codes that doesn't have national prefix, but it has area + /// codes. + pub countries_without_national_prefix_with_area_codes: HashSet, + + /// Set of country codes that have geographically assigned mobile numbers (see + /// geo_mobile_countries_ below) which are not based on *area codes*. For + /// example, in China mobile numbers start with a carrier indicator, and beyond + /// that are geographically assigned: this carrier indicator is not considered + /// to be an area code. + pub geo_mobile_countries_without_mobile_area_codes: HashSet, + + /// Set of country calling codes that have geographically assigned mobile + /// numbers. This may not be complete; we add calling codes case by case, as we + /// find geographical mobile numbers or hear from user reports. + pub geo_mobile_countries: HashSet, + + /// Pattern that makes it easy to distinguish whether a region has a single + /// international dialing prefix or not. If a region has a single international + /// prefix (e.g. 011 in USA), it will be represented as a string that contains + /// a sequence of ASCII digits, and possibly a tilde, which signals waiting for + /// the tone. If there are multiple available international prefixes in a + /// region, they will be represented as a regex string that always contains one + /// or more characters that are not ASCII digits or a tilde. + pub single_international_prefix: Regex, + + pub digits_pattern: Regex, + pub capturing_digit_pattern: Regex, + pub capturing_ascii_digits_pattern: Regex, + + /// Regular expression of acceptable characters that may start a phone number + /// for the purposes of parsing. This allows us to strip away meaningless + /// prefixes to phone numbers that may be mistakenly given to us. 
This consists + /// of digits, the plus symbol and arabic-indic digits. This does not contain + /// alpha characters, although they may be used later in the number. It also + /// does not include other punctuation, as this will be stripped later during + /// parsing and is of no information value when parsing a number. The string + /// starting with this valid character is captured. + /// This corresponds to VALID_START_CHAR in the java version. + pub valid_start_char_pattern: Regex, + + /// Regular expression of valid characters before a marker that might indicate + /// a second number. + pub capture_up_to_second_number_start_pattern: Regex, + + /// Regular expression of trailing characters that we want to remove. We remove + /// all characters that are not alpha or numerical characters. The hash + /// character is retained here, as it may signify the previous block was an + /// extension. Note the capturing block at the start to capture the rest of the + /// number if this was a match. + /// This corresponds to UNWANTED_END_CHAR_PATTERN in the java version. + pub unwanted_end_char_pattern: Regex, + + /// Regular expression of groups of valid punctuation characters. + pub separator_pattern: Regex, + + /// Regexp of all possible ways to write extensions, for use when finding phone + /// numbers in text. This will be run as a case-insensitive regexp match. Wide + /// character versions are also provided after each ASCII version. + pub extn_patterns_for_matching: String, + + /// Regexp of all known extension prefixes used by different regions followed + /// by 1 or more valid digits, for use when parsing. + pub extn_pattern: Regex, + + /// We append optionally the extension pattern to the end here, as a valid + /// phone number may have an extension prefix appended, followed by 1 or more + /// digits. 
+ pub valid_phone_number_pattern: Regex, + + /// We use this pattern to check if the phone number has at least three letters + /// in it - if so, then we treat it as a number where some phone-number digits + /// are represented by letters. + pub valid_alpha_phone_pattern: Regex, + + pub first_group_capturing_pattern: Regex, + + pub carrier_code_pattern: Regex, + + pub plus_chars_pattern: Regex, + + /// Regular expression of valid global-number-digits for the phone-context + /// parameter, following the syntax defined in RFC3966. + pub rfc3966_global_number_digits_pattern: Regex, + + /// Regular expression of valid domainname for the phone-context parameter, + /// following the syntax defined in RFC3966. + pub rfc3966_domainname_pattern: Regex, +} + +impl PhoneNumberRegExpsAndMappings { + fn initialize_regexp_mappings(&mut self) { + self.mobile_token_mappings.insert(54, '9'); + + self.geo_mobile_countries_without_mobile_area_codes.insert(86); // China + + self.countries_without_national_prefix_with_area_codes.insert(52); // Mexico + + self.geo_mobile_countries.insert(52); // Mexico + self.geo_mobile_countries.insert(54); // Argentina + self.geo_mobile_countries.insert(55); // Brazil + self.geo_mobile_countries.insert(62); // Indonesia: some prefixes only (fixed CMDA wireless) + self.geo_mobile_countries.extend(&self.geo_mobile_countries_without_mobile_area_codes); + + // Simple ASCII digits map used to populate ALPHA_PHONE_MAPPINGS and + // ALL_PLUS_NUMBER_GROUPING_SYMBOLS. 
+        let mut ascii_digit_mappings = HashMap::with_capacity(10);
+        for d in '0'..='9' { // inclusive range: '9' must be mapped as well (10 digits total)
+            ascii_digit_mappings.insert(d, d);
+        }
+
+        let mut alpha_map = HashMap::with_capacity(40);
+        alpha_map.insert('A', '2');
+        alpha_map.insert('B', '2');
+        alpha_map.insert('C', '2');
+        alpha_map.insert('D', '3');
+        alpha_map.insert('E', '3');
+        alpha_map.insert('F', '3');
+        alpha_map.insert('G', '4');
+        alpha_map.insert('H', '4');
+        alpha_map.insert('I', '4');
+        alpha_map.insert('J', '5');
+        alpha_map.insert('K', '5');
+        alpha_map.insert('L', '5');
+        alpha_map.insert('M', '6');
+        alpha_map.insert('N', '6');
+        alpha_map.insert('O', '6');
+        alpha_map.insert('P', '7');
+        alpha_map.insert('Q', '7');
+        alpha_map.insert('R', '7');
+        alpha_map.insert('S', '7');
+        alpha_map.insert('T', '8');
+        alpha_map.insert('U', '8');
+        alpha_map.insert('V', '8');
+        alpha_map.insert('W', '9');
+        alpha_map.insert('X', '9');
+        alpha_map.insert('Y', '9');
+        alpha_map.insert('Z', '9');
+        self.alpha_mappings = alpha_map;
+
+        let mut combined_map = HashMap::with_capacity(100);
+        combined_map.extend(self.alpha_mappings.iter());
+        combined_map.extend(ascii_digit_mappings.iter());
+        self.alpha_phone_mappings = combined_map;
+
+        let mut diallable_char_map = HashMap::new();
+        diallable_char_map.extend(ascii_digit_mappings.iter());
+        diallable_char_map.insert('+', '+');
+        diallable_char_map.insert('*', '*');
+        diallable_char_map.insert('#', '#');
+        self.diallable_char_mappings = diallable_char_map;
+
+        let mut all_plus_number_groupings = HashMap::new();
+        // insert (lower letter -> upper letter) and (upper letter -> upper letter) mappings.
+        for c in self.alpha_mappings.keys() {
+            all_plus_number_groupings.insert(c.to_ascii_lowercase(), *c);
+            all_plus_number_groupings.insert(*c, *c);
+        }
+        all_plus_number_groupings.extend(ascii_digit_mappings.iter());
+        // insert grouping symbols.
+ all_plus_number_groupings.insert('-', '-'); + all_plus_number_groupings.insert('\u{FF0D}', '-'); + all_plus_number_groupings.insert('\u{2010}', '-'); + all_plus_number_groupings.insert('\u{2011}', '-'); + all_plus_number_groupings.insert('\u{2012}', '-'); + all_plus_number_groupings.insert('\u{2013}', '-'); + all_plus_number_groupings.insert('\u{2014}', '-'); + all_plus_number_groupings.insert('\u{2015}', '-'); + all_plus_number_groupings.insert('\u{2212}', '-'); + all_plus_number_groupings.insert('/', '/'); + all_plus_number_groupings.insert('\u{FF0F}', '/'); + all_plus_number_groupings.insert(' ', ' '); + all_plus_number_groupings.insert('\u{3000}', ' '); + all_plus_number_groupings.insert('\u{2060}', ' '); + all_plus_number_groupings.insert('.', '.'); + all_plus_number_groupings.insert('\u{FF0E}', '.'); + self.all_plus_number_grouping_symbols = all_plus_number_groupings; + } + + fn new() -> Self { + let alphanum = fast_cat::concat_str!(VALID_ALPHA_INCL_UPPERCASE, DIGITS); + let extn_patterns_for_parsing = create_extn_pattern(true); + let valid_phone_number = format!( + "{}{{{}}}|[{}]*(?:[{}{}]*{}){{3,}}[{}{}{}{}]*", + DIGITS, MIN_LENGTH_FOR_NSN, PLUS_CHARS, + VALID_PUNCTUATION, STAR_SIGN, DIGITS, + VALID_PUNCTUATION, STAR_SIGN, VALID_ALPHA, DIGITS + ); + + let rfc3966_phone_digit = format!("({}|{})", DIGITS, RFC3966_VISUAL_SEPARATOR); + let rfc3966_domainlabel = format!("[{}]+((\\-)*[{}])*", alphanum, alphanum); + let rfc3966_toplabel = format!("[{}]+((\\-)*[{}])*", VALID_ALPHA_INCL_UPPERCASE, alphanum); + + let mut instance = Self{ + // it'll be initialized only once, so we can use slow format! 
+ valid_phone_number: valid_phone_number.clone(), + extn_patterns_for_parsing: extn_patterns_for_parsing.clone(), + rfc3966_phone_digit: rfc3966_phone_digit.clone(), + alphanum: alphanum, + rfc3966_domainlabel: rfc3966_domainlabel.clone(), + rfc3966_toplabel: rfc3966_toplabel.clone(), + regexp_cache: RegexCache::with_capacity(128), + diallable_char_mappings: Default::default(), + alpha_mappings: Default::default(), + alpha_phone_mappings: Default::default(), + all_plus_number_grouping_symbols: Default::default(), + mobile_token_mappings: Default::default(), + countries_without_national_prefix_with_area_codes: Default::default(), + geo_mobile_countries: Default::default(), + geo_mobile_countries_without_mobile_area_codes: Default::default(), + single_international_prefix: Regex::new("[\\d]+(?:[~\u{2053}\u{223C}\u{FF5E}][\\d]+)?").unwrap(), + digits_pattern: Regex::new(&format!("[{}]*", DIGITS)).unwrap(), + capturing_digit_pattern: Regex::new(&format!("([{}])", DIGITS)).unwrap(), + capturing_ascii_digits_pattern: Regex::new("(\\d+)").unwrap(), + valid_start_char_pattern: Regex::new(&format!("[{}{}]", PLUS_CHARS, DIGITS)).unwrap(), + capture_up_to_second_number_start_pattern: Regex::new(CAPTURE_UP_TO_SECOND_NUMBER_START).unwrap(), + unwanted_end_char_pattern: Regex::new("[^\\p{N}\\p{L}#]").unwrap(), + separator_pattern: Regex::new(&format!("[{}]+", VALID_PUNCTUATION)).unwrap(), + extn_patterns_for_matching: create_extn_pattern(false), + extn_pattern: Regex::new(&format!("(?i)(?:{})$", &extn_patterns_for_parsing)).unwrap(), + valid_phone_number_pattern: Regex::new(&format!("(?i){}(?:{})?", + &valid_phone_number, + extn_patterns_for_parsing + )).unwrap(), + valid_alpha_phone_pattern: Regex::new(&format!("(?i)(?:.*?[{}]){{3}}", + VALID_ALPHA + )).unwrap(), + // The first_group_capturing_pattern was originally set to $1 but there + // are some countries for which the first group is not used in the + // national pattern (e.g. 
Argentina) so the $1 group does not match + // correctly. Therefore, we use \d, so that the first group actually + // used in the pattern will be matched. + first_group_capturing_pattern: Regex::new("(\\$\\d)").unwrap(), + carrier_code_pattern: Regex::new("\\$CC").unwrap(), + plus_chars_pattern: Regex::new(&format!("[{}]+", &PLUS_CHARS)).unwrap(), + rfc3966_global_number_digits_pattern: Regex::new( + &format!("^\\{}{}*{}{}*$", PLUS_SIGN, &rfc3966_phone_digit, DIGITS, rfc3966_phone_digit) + ).unwrap(), + rfc3966_domainname_pattern: Regex::new( + &format!("^({}\\.)*{}\\.?$", rfc3966_domainlabel, rfc3966_toplabel) + ).unwrap(), + }; + instance.initialize_regexp_mappings(); + instance + } +} \ No newline at end of file diff --git a/src/regex_based_matcher.rs b/src/regex_based_matcher.rs new file mode 100644 index 0000000..5e62852 --- /dev/null +++ b/src/regex_based_matcher.rs @@ -0,0 +1,57 @@ +use log::{error}; + +use crate::{interfaces, proto_gen::phonemetadata::PhoneNumberDesc, regexp_cache::{self, RegexCache}}; + +pub struct RegexBasedMatcher { + cache: RegexCache, +} + +impl RegexBasedMatcher { + pub fn new() -> Self { + Self { cache: RegexCache::with_capacity(128) } + } + + fn match_number( + &self, phone_number: &str, + number_pattern: &str, + allow_prefix_match: bool + ) -> Result { + let regexp = self.cache.get_regex(number_pattern)?; + + // find first occurrence + if let Some(mat) = regexp.find(phone_number) { + // if first position is not matched none of scenarios are possible + if mat.start() != 0 { + return Ok(false); + } + // full match + if mat.end() == phone_number.len() { + return Ok(true); + } else if allow_prefix_match { + return Ok(true); + } + } + Ok(false) + } +} + +impl interfaces::MatcherApi for RegexBasedMatcher { + fn match_national_number( + &self, number: &str, + number_desc: &PhoneNumberDesc, + allow_prefix_match: bool + ) -> bool { + let national_number_pattern = number_desc.national_number_pattern(); + // We don't want to consider it a 
prefix match when matching non-empty input + // against an empty pattern. + if national_number_pattern.is_empty() { + return false; + } + if let Ok(res) = self.match_number(number, national_number_pattern, allow_prefix_match) { + res + } else { + error!("Invalid regex! {}", national_number_pattern); + false + } + } +} \ No newline at end of file