evergreen/
norm.rs

1use icu_normalizer::DecomposingNormalizer;
2use regex::Regex;
3use std::sync::OnceLock;
4
5/// Store these globally to avoid repititive regex recompilation.
6static REGEX_CONTROL_CODES: OnceLock<Regex> = OnceLock::new();
7static REGEX_PUNCTUATION: OnceLock<Regex> = OnceLock::new();
8static REGEX_MULTI_SPACES: OnceLock<Regex> = OnceLock::new();
9
10const REGEX_CONTROL_CODES_PATTERN: &str = r#"[\p{Cc}\p{Cf}\p{Co}\p{Lm}\p{Mc}\p{Me}\p{Mn}]"#;
11const REGEX_PUNCTUATION_PATTERN: &str =
12    r#"[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]"#;
13
/// Handle through which the normalization routines are invoked.
///
/// The struct has no fields and therefore carries no state. As is, it is no
/// longer necessary but is retained for backwards compat.
#[derive(Debug, Clone, Copy, Default)]
pub struct Normalizer {}
17
18impl Normalizer {
19    /// Pre-compile our regular expressions.
20    ///
21    /// Ideally called once before threads are spawned.
22    pub fn init() {
23        // Treat multiple attempts to apply values to our regex
24        // oncelocks as non-errors, since it can happen if multiple
25        // threads call init() at practically the same time. However,
26        // exit as soon as we know any of the regexes have been applied.
27        if REGEX_CONTROL_CODES
28            .set(Regex::new(REGEX_CONTROL_CODES_PATTERN).unwrap())
29            .is_err()
30        {
31            return;
32        }
33
34        if REGEX_PUNCTUATION
35            .set(Regex::new(REGEX_PUNCTUATION_PATTERN).unwrap())
36            .is_err()
37        {
38            return;
39        }
40
41        REGEX_MULTI_SPACES.set(Regex::new("\\s+").unwrap()).ok();
42    }
43
44    pub fn new() -> Normalizer {
45        Default::default()
46    }
47
48    pub fn naco_normalize_once(value: &str) -> String {
49        Normalizer::new().naco_normalize(value)
50    }
51
52    /// See Evergreen/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
53    ///
54    /// # Examples
55    ///
56    /// ```
57    /// use evergreen::norm::Normalizer;
58    /// Normalizer::init();
59    ///
60    /// let normalizer = Normalizer::new();
61    /// assert_eq!(normalizer.naco_normalize("Café"), normalizer.naco_normalize("cafe"));
62    /// assert_eq!(
63    ///     normalizer.naco_normalize(concat!("\u{009C}", "Pushkin")),
64    ///     normalizer.naco_normalize("Pushkin")
65    /// );
66    /// assert_eq!(
67    ///     normalizer.naco_normalize(concat!("Library", "\u{009C}")),
68    ///     normalizer.naco_normalize("Library")
69    /// );
70    /// assert_eq!(normalizer.naco_normalize("‘Hello’"), normalizer.naco_normalize("Hello"));
71    /// assert_eq!(normalizer.naco_normalize("Ægis"), normalizer.naco_normalize("aegis"));
72    /// assert_eq!(normalizer.naco_normalize("Ryan, Pam Muñoz"), "ryan pam munoz");
73    /// ```
74    ///
75    /// # Panics
76    ///
77    /// Panics if Normalizer::init() is not called first.
78    pub fn naco_normalize(&self, value: &str) -> String {
79        let mut value = self.normalize_substitutions(value);
80        value = value.replace('\'', "");
81        self.normalize_codes(value)
82    }
83
84    fn normalize_substitutions(&self, value: &str) -> String {
85        let value = value
86            .to_uppercase()
87            // Start/End of string characters
88            .replace(['\u{0098}', '\u{009C}'], "")
89            // Single-quote-like characters
90            .replace(
91                ['\u{2018}', '\u{2019}', '\u{201B}', '\u{FF07}', '\u{201A}'],
92                "'",
93            )
94            // Double-quote-like characters
95            .replace(
96                [
97                    '\u{201C}', '\u{201D}', '\u{201F}', '\u{FF0C}', '\u{201E}', '\u{2E42}',
98                ],
99                "\"",
100            );
101
102        let normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
103        let value = normalizer.normalize(&value);
104
105        // Additional substitutions
106        value
107            .replace('\u{00C6}', "AE")
108            .replace('\u{00DE}', "TH")
109            .replace('\u{0152}', "OE")
110            .replace(['\u{0110}', '\u{00D0}'], "D")
111            .replace('\u{00D8}', "O")
112            .replace('\u{0141}', "L")
113            .replace('\u{0142}', "l")
114            .replace(['\u{2113}', '\u{02BB}', '\u{02BC}'], "")
115    }
116
117    fn normalize_codes(&self, value: String) -> String {
118        let mut value = REGEX_CONTROL_CODES
119            .get()
120            .expect("Normalizer::init() should be called first")
121            .replace_all(&value, "")
122            .into_owned();
123
124        // Set aside some chars for safe keeping.
125        value = value
126            .replace('+', "\u{01}")
127            .replace('&', "\u{02}")
128            .replace('@', "\u{03}")
129            .replace('\u{266D}', "\u{04}")
130            .replace('\u{266F}', "\u{05}")
131            .replace('#', "\u{06}");
132
133        value = REGEX_PUNCTUATION
134            .get()
135            .expect("Normalizer::init() should be called first")
136            .replace_all(&value, " ")
137            .into_owned();
138
139        // Now put them back
140        value = value
141            .replace('\u{01}', "+")
142            .replace('\u{02}', "&")
143            .replace('\u{03}', "@")
144            .replace('\u{04}', "\u{266D}")
145            .replace('\u{05}', "\u{266F}")
146            .replace('\u{06}', "#");
147
148        // TODO decimal digits
149
150        /*
151        $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
152        */
153
154        value = REGEX_MULTI_SPACES
155            .get()
156            .expect("Normalizer::init() should be called first")
157            .replace_all(&value, " ")
158            .into_owned();
159
160        // leaing / trailing spaces
161        value.trim().to_lowercase()
162    }
163}