evergreen/norm.rs
use icu_normalizer::DecomposingNormalizer;
use regex::Regex;
use std::sync::OnceLock;

/// Store these globally to avoid repetitive regex recompilation.
static REGEX_CONTROL_CODES: OnceLock<Regex> = OnceLock::new();
static REGEX_PUNCTUATION: OnceLock<Regex> = OnceLock::new();
static REGEX_MULTI_SPACES: OnceLock<Regex> = OnceLock::new();

const REGEX_CONTROL_CODES_PATTERN: &str = r#"[\p{Cc}\p{Cf}\p{Co}\p{Lm}\p{Mc}\p{Me}\p{Mn}]"#;
const REGEX_PUNCTUATION_PATTERN: &str =
    r#"[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]"#;

/// As is, this struct is no longer strictly necessary, but it is retained
/// for backwards compatibility.
#[derive(Default)]
pub struct Normalizer {}

impl Normalizer {
    /// Pre-compile our regular expressions.
    ///
    /// Ideally called once before threads are spawned.
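    ///
    /// Calling `init()` more than once is harmless: if a regex has already
    /// been set, later calls simply return early. A minimal usage sketch:
    ///
    /// ```
    /// use evergreen::norm::Normalizer;
    /// Normalizer::init();
    /// Normalizer::init(); // no-op on the second call
    /// ```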
    pub fn init() {
        // Treat multiple attempts to set values on our regex OnceLocks
        // as non-errors, since that can happen if multiple threads call
        // init() at practically the same time. However, exit as soon as
        // we know any of the regexes have already been applied.
        if REGEX_CONTROL_CODES
            .set(Regex::new(REGEX_CONTROL_CODES_PATTERN).unwrap())
            .is_err()
        {
            return;
        }

        if REGEX_PUNCTUATION
            .set(Regex::new(REGEX_PUNCTUATION_PATTERN).unwrap())
            .is_err()
        {
            return;
        }

        REGEX_MULTI_SPACES.set(Regex::new("\\s+").unwrap()).ok();
    }

    pub fn new() -> Normalizer {
        Default::default()
    }

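    /// Normalize a value without explicitly constructing a `Normalizer`.
    ///
    /// A usage sketch, mirroring the `naco_normalize` examples below
    /// (requires `Normalizer::init()` to have been called):
    ///
    /// ```
    /// use evergreen::norm::Normalizer;
    /// Normalizer::init();
    ///
    /// assert_eq!(Normalizer::naco_normalize_once("Ryan, Pam Muñoz"), "ryan pam munoz");
    /// ```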
    pub fn naco_normalize_once(value: &str) -> String {
        Normalizer::new().naco_normalize(value)
    }

    /// See Evergreen/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
    ///
    /// # Examples
    ///
    /// ```
    /// use evergreen::norm::Normalizer;
    /// Normalizer::init();
    ///
    /// let normalizer = Normalizer::new();
    /// assert_eq!(normalizer.naco_normalize("Café"), normalizer.naco_normalize("cafe"));
    /// assert_eq!(
    ///     normalizer.naco_normalize(concat!("\u{009C}", "Pushkin")),
    ///     normalizer.naco_normalize("Pushkin")
    /// );
    /// assert_eq!(
    ///     normalizer.naco_normalize(concat!("Library", "\u{009C}")),
    ///     normalizer.naco_normalize("Library")
    /// );
    /// assert_eq!(normalizer.naco_normalize("‘Hello’"), normalizer.naco_normalize("Hello"));
    /// assert_eq!(normalizer.naco_normalize("Ægis"), normalizer.naco_normalize("aegis"));
    /// assert_eq!(normalizer.naco_normalize("Ryan, Pam Muñoz"), "ryan pam munoz");
    /// ```
    ///
    /// # Panics
    ///
    /// Panics if `Normalizer::init()` is not called first.
    pub fn naco_normalize(&self, value: &str) -> String {
        let mut value = self.normalize_substitutions(value);
        value = value.replace('\'', "");
        self.normalize_codes(value)
    }

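    /// Uppercase the value, strip start/end-of-string markers, map
    /// quote-like characters to plain ASCII quotes, apply NFKD
    /// decomposition, then fold a handful of special letters to ASCII
    /// (e.g. "Ægis" becomes "AEGIS").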
    fn normalize_substitutions(&self, value: &str) -> String {
        let value = value
            .to_uppercase()
            // Start/End of string characters
            .replace(['\u{0098}', '\u{009C}'], "")
            // Single-quote-like characters
            .replace(
                ['\u{2018}', '\u{2019}', '\u{201B}', '\u{FF07}', '\u{201A}'],
                "'",
            )
            // Double-quote-like characters
            .replace(
                [
                    '\u{201C}', '\u{201D}', '\u{201F}', '\u{FF0C}', '\u{201E}', '\u{2E42}',
                ],
                "\"",
            );

        let normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
        let value = normalizer.normalize(&value);

        // Additional substitutions
        value
            .replace('\u{00C6}', "AE")
            .replace('\u{00DE}', "TH")
            .replace('\u{0152}', "OE")
            .replace(['\u{0110}', '\u{00D0}'], "D")
            .replace('\u{00D8}', "O")
            .replace('\u{0141}', "L")
            .replace('\u{0142}', "l")
            .replace(['\u{2113}', '\u{02BB}', '\u{02BC}'], "")
    }

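    /// Strip control, formatting, modifier, and combining codes, replace
    /// punctuation and most symbols with spaces while preserving
    /// `+`, `&`, `@`, `#`, `♭`, and `♯`, collapse repeated whitespace,
    /// then trim and lowercase the result.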
    fn normalize_codes(&self, value: String) -> String {
        let mut value = REGEX_CONTROL_CODES
            .get()
            .expect("Normalizer::init() should be called first")
            .replace_all(&value, "")
            .into_owned();

        // Set aside some chars for safekeeping; they would otherwise be
        // stripped by the punctuation pass below.
        value = value
            .replace('+', "\u{01}")
            .replace('&', "\u{02}")
            .replace('@', "\u{03}")
            .replace('\u{266D}', "\u{04}")
            .replace('\u{266F}', "\u{05}")
            .replace('#', "\u{06}");

        value = REGEX_PUNCTUATION
            .get()
            .expect("Normalizer::init() should be called first")
            .replace_all(&value, " ")
            .into_owned();

        // Now put them back
        value = value
            .replace('\u{01}', "+")
            .replace('\u{02}', "&")
            .replace('\u{03}', "@")
            .replace('\u{04}', "\u{266D}")
            .replace('\u{05}', "\u{266F}")
            .replace('\u{06}', "#");

        // TODO decimal digits

        /*
        $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
        */
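        // A hypothetical Rust sketch of that digit folding (illustrative
        // only; just the Arabic-Indic range U+0660..U+0669 is spelled out,
        // and the remaining ranges in the tr/// above would follow the
        // same pattern):
        //
        // value = value
        //     .chars()
        //     .map(|c| match c as u32 {
        //         // ARABIC-INDIC DIGIT ZERO..NINE -> '0'..'9'
        //         n @ 0x0660..=0x0669 => {
        //             char::from_u32('0' as u32 + (n - 0x0660)).unwrap_or(c)
        //         }
        //         _ => c,
        //     })
        //     .collect();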

        value = REGEX_MULTI_SPACES
            .get()
            .expect("Normalizer::init() should be called first")
            .replace_all(&value, " ")
            .into_owned();

        // leading / trailing spaces
        value.trim().to_lowercase()
    }
}
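
#[cfg(test)]
mod tests {
    use super::*;

    // An illustrative test sketch (hypothetical, not part of the original
    // module): '+', '&', and '#' are set aside in normalize_codes() and
    // survive the punctuation pass, while '!' is replaced with a space
    // and trimmed away.
    #[test]
    fn preserves_set_aside_symbols() {
        Normalizer::init();
        assert_eq!(Normalizer::naco_normalize_once("C++ & #Rust!"), "c++ & #rust");
    }
}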