marctk/
xml.rs

1//! Routines for reading and writing MARC XML
2use std::fs::File;
3use std::io::BufReader;
4use std::io::Cursor;
5use xml::attribute::OwnedAttribute;
6use xml::reader::{EventReader, XmlEvent};
7
8use super::Controlfield;
9use super::Field;
10use super::Record;
11use super::Subfield;
12
/// Namespace URI written as the default `xmlns` of generated `<record>` elements.
pub const MARCXML_NAMESPACE: &str = "http://www.loc.gov/MARC21/slim";
/// Namespace URI written as `xmlns:xsi` on generated `<record>` elements.
pub const MARCXML_XSI_NAMESPACE: &str = "http://www.w3.org/2001/XMLSchema-instance";
/// Value written to the `xsi:schemaLocation` attribute of generated
/// `<record>` elements.
pub const MARCXML_SCHEMA_LOCATION: &str =
    "http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd";
17
/// Escape a string so it can be embedded in XML output.
///
/// `&`, `<` and `>` are replaced with their named entities, and every
/// character above `~` (U+007E) is replaced with an upper-case
/// hexadecimal character reference.
///
/// * is_attr - If true, single and double quotes are escaped as well.
///
/// ```
/// use marctk::xml;
/// assert_eq!(xml::escape_xml("<'É'>", false).as_str(), "&lt;'&#xC9;'&gt;");
/// assert_eq!(xml::escape_xml("<'É'>", true).as_str(), "&lt;&apos;&#xC9;&apos;&gt;");
/// ```
pub fn escape_xml(value: &str, is_attr: bool) -> String {
    let mut escaped = String::new();

    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            // Quotes only need escaping inside attribute values.
            '\'' if is_attr => escaped.push_str("&apos;"),
            '"' if is_attr => escaped.push_str("&quot;"),
            '>' => escaped.push_str("&gt;"),
            '<' => escaped.push_str("&lt;"),
            // Everything past the last printable ASCII character is
            // written as a numeric character reference.
            _ if ch > '~' => {
                let code_point = u32::from(ch);
                escaped.push_str(&format!("&#x{code_point:X};"));
            }
            _ => escaped.push(ch),
        }
    }

    escaped
}
51
/// Start a new line in `value` and indent it with `depth` spaces.
///
/// Leaves `value` untouched unless `formatted` is true.
fn format(formatted: bool, value: &mut String, depth: u8) {
    if !formatted {
        return;
    }

    value.push('\n');
    value.extend(std::iter::repeat(' ').take(usize::from(depth)));
}
61
/// Options for controlling the format of XML output
pub struct XmlOptions {
    /// Format the generated XML with one element per line and a
    /// 2-space indent per nesting level.
    pub formatted: bool,
    /// Include an XML declaration in the generated XML.
    pub with_xml_declaration: bool,
}
69
/// Parser state tracked while compiling one [`Record`] from a stream
/// of XML events.
struct XmlParseContext {
    /// The record being built from the events seen so far.
    record: Record,
    /// Set when a `<controlfield>` start tag is seen; the next text
    /// event is stored as that control field's content.
    in_cfield: bool,
    /// Set when a `<subfield>` start tag with a `code` attribute is
    /// seen; the next text event is stored as that subfield's content.
    in_subfield: bool,
    /// Set when a `<leader>` start tag is seen; the next text event is
    /// stored as the record's leader.
    in_leader: bool,
    /// Set when a `</record>` end tag is seen.
    record_complete: bool,
    /// Set when the end of the XML document is reached.
    doc_complete: bool,
}
78
/// Iterator which parses MARC XML and emits one [`Record`] for each
/// `<record>` element it encounters.
pub enum XmlRecordIterator {
    /// Pulls XML events from a buffered file.
    FileReader(EventReader<BufReader<File>>),
    /// Pulls XML events from an in-memory byte buffer.
    ByteReader(EventReader<Cursor<Vec<u8>>>),
}
83
84impl Iterator for XmlRecordIterator {
85    type Item = Result<Record, String>;
86
87    fn next(&mut self) -> Option<Self::Item> {
88        let mut context = XmlParseContext {
89            record: Record::new(),
90            in_cfield: false,
91            in_subfield: false,
92            in_leader: false,
93            record_complete: false,
94            doc_complete: false,
95        };
96
97        self.read_next(&mut context).transpose()
98    }
99}
100
101impl XmlRecordIterator {
102    /// Create a new iterator from a MARC XML file
103    fn from_file(filename: &str) -> Result<Self, String> {
104        match File::open(filename) {
105            Ok(file) => Ok(XmlRecordIterator::FileReader(EventReader::new(
106                BufReader::new(file),
107            ))),
108            Err(e) => Err(format!("Cannot read MARCXML file: {filename} {e}")),
109        }
110    }
111
112    /// Create a new iterator from a MARC string
113    fn from_string(xml: &str) -> Self {
114        XmlRecordIterator::ByteReader(EventReader::new(Cursor::new(xml.as_bytes().to_vec())))
115    }
116
117    /// Pull the next Record from the data source.
118    fn read_next(&mut self, context: &mut XmlParseContext) -> Result<Option<Record>, String> {
119        loop {
120            let evt_res = match *self {
121                XmlRecordIterator::FileReader(ref mut reader) => reader.next(),
122                XmlRecordIterator::ByteReader(ref mut reader) => reader.next(),
123            };
124
125            let evt = evt_res.map_err(|e| format!("Error processing XML: {e}"))?;
126
127            if let Err(e) = self.handle_xml_event(context, evt) {
128                return Err(format!("Error processing XML: {e}"));
129            }
130
131            if context.record_complete {
132                // Return the compiled record and replace it with a new one.
133                return Ok(Some(std::mem::take(&mut context.record)));
134            } else if context.doc_complete {
135                // If we had a doc in progress, discard it.
136                context.record = Record::new();
137
138                // All done.  Get outta here.
139                return Ok(None);
140            }
141        }
142    }
143
144    /// Process a single XML read event
145    fn handle_xml_event(
146        &mut self,
147        context: &mut XmlParseContext,
148        evt: XmlEvent,
149    ) -> Result<(), String> {
150        let record = &mut context.record;
151
152        match evt {
153            XmlEvent::StartElement {
154                name, attributes, ..
155            } => {
156                self.handle_start_element(context, name.local_name.as_str(), &attributes)?;
157            }
158
159            XmlEvent::Characters(ref characters) => {
160                if context.in_leader {
161                    record.set_leader(characters)?;
162                    context.in_leader = false;
163                } else if context.in_cfield {
164                    if let Some(cf) = record.control_fields_mut().last_mut() {
165                        cf.set_content(characters);
166                    }
167                    context.in_cfield = false;
168                } else if context.in_subfield {
169                    if let Some(field) = record.fields_mut().last_mut() {
170                        if let Some(subfield) = field.subfields_mut().last_mut() {
171                            subfield.set_content(characters);
172                        }
173                    }
174                    context.in_subfield = false;
175                }
176            }
177
178            XmlEvent::EndElement { name, .. } => {
179                if name.local_name.as_str() == "record" {
180                    context.record_complete = true;
181                }
182            }
183
184            XmlEvent::EndDocument => {
185                context.doc_complete = true;
186            }
187
188            _ => {}
189        }
190
191        Ok(())
192    }
193
194    fn handle_start_element(
195        &mut self,
196        context: &mut XmlParseContext,
197        name: &str,
198        attributes: &Vec<OwnedAttribute>,
199    ) -> Result<(), String> {
200        let record = &mut context.record;
201
202        match name {
203            "leader" => context.in_leader = true,
204
205            "controlfield" => {
206                if let Some(t) = attributes.iter().find(|a| a.name.local_name.eq("tag")) {
207                    record
208                        .control_fields_mut()
209                        .push(Controlfield::new(&t.value, "")?);
210                    context.in_cfield = true;
211                } else {
212                    return Err("Controlfield has no tag".to_string());
213                }
214            }
215
216            "datafield" => {
217                let mut field = match attributes.iter().find(|a| a.name.local_name.eq("tag")) {
218                    Some(attr) => Field::new(&attr.value)?,
219                    None => {
220                        return Err("Data field has no tag".to_string());
221                    }
222                };
223
224                for attr in attributes {
225                    match attr.name.local_name.as_str() {
226                        "ind1" => field.set_ind1(&attr.value)?,
227                        "ind2" => field.set_ind2(&attr.value)?,
228                        _ => {}
229                    }
230                }
231
232                record.fields_mut().push(field);
233            }
234
235            "subfield" => {
236                let field_op = record.fields_mut().last_mut();
237
238                if field_op.is_none() {
239                    return Err("Encounted <subfield/> without a field".to_string());
240                }
241
242                let field = field_op.unwrap();
243                for attr in attributes {
244                    if attr.name.local_name.eq("code") {
245                        context.in_subfield = true;
246                        field.subfields_mut().push(Subfield::new(&attr.value, "")?);
247                        break;
248                    }
249                }
250            }
251            _ => {}
252        }
253
254        Ok(())
255    }
256}
257
258impl Record {
259    /// Returns an iterator over the XML file which emits Records.
260    pub fn from_xml_file(filename: &str) -> Result<XmlRecordIterator, String> {
261        XmlRecordIterator::from_file(filename)
262    }
263
264    /// Returns an iterator over the XML string which emits Records.
265    ///
266    /// It can parse MarcXML strings, whether or not they have the appropriate
267    /// XML namespace (`http://www.loc.gov/MARC21/slim`).
268    ///
269    /// # Examples
270    ///
271    /// ```
272    /// use marctk::Record;
273    ///
274    /// let iterator = Record::from_xml(r#"<collection>
275    ///   <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">First title</subfield></datafield></record>
276    ///   <record xmlns="http://www.loc.gov/MARC21/slim"><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Second title</subfield></datafield></record>
277    /// </collection>"#);
278    ///
279    /// let values: Vec<String> = iterator.map(|item| item.unwrap().get_field_values("245", "a")[0].to_owned())
280    ///     .collect();
281    /// assert_eq!(values, ["First title".to_string(), "Second title".to_string()]);
282    /// ```
283    pub fn from_xml(xml: &str) -> XmlRecordIterator {
284        XmlRecordIterator::from_string(xml)
285    }
286
287    #[deprecated(note = "See to_xml_string()")]
288    pub fn to_xml(&self) -> String {
289        self.to_xml_string()
290    }
291
292    /// Creates an XML string from a [`Record`]
293    pub fn to_xml_string(&self) -> String {
294        self.to_xml_string_ops(&XmlOptions {
295            formatted: false,
296            with_xml_declaration: false,
297        })
298    }
299
300    #[deprecated(note = "See to_xml_string_formatted()")]
301    pub fn to_xml_formatted(&self) -> String {
302        self.to_xml_string_formatted()
303    }
304
305    /// Creates an XML string from a [`Record`] formatted with 2-space indents.
306    pub fn to_xml_string_formatted(&self) -> String {
307        self.to_xml_string_ops(&XmlOptions {
308            formatted: true,
309            with_xml_declaration: false,
310        })
311    }
312
313    #[deprecated(note = "See to_xml_string_ops()")]
314    pub fn to_xml_ops(&self, options: &XmlOptions) -> String {
315        self.to_xml_string_ops(options)
316    }
317
318    /// Creates an XML string from a [`Record`] using the provided options.
319    pub fn to_xml_string_ops(&self, options: &XmlOptions) -> String {
320        // We could use XmlWriter here, but manual creation works fine
321        // and offers more flexibility.
322
323        let mut xml = match options.with_xml_declaration {
324            true => String::from(r#"<?xml version="1.0"?>"#),
325            _ => String::new(),
326        };
327
328        // Document root
329
330        if options.formatted {
331            xml += &format!(
332                "\n<record\n  xmlns=\"{}\"\n  xmlns:xsi=\"{}\"\n  xsi:schemaLocation=\"{}\">",
333                MARCXML_NAMESPACE, MARCXML_XSI_NAMESPACE, MARCXML_SCHEMA_LOCATION
334            );
335        } else {
336            xml += &format!(
337                r#"<record xmlns="{}" xmlns:xsi="{}" xsi:schemaLocation="{}">"#,
338                MARCXML_NAMESPACE, MARCXML_XSI_NAMESPACE, MARCXML_SCHEMA_LOCATION
339            );
340        }
341
342        // Leader
343
344        format(options.formatted, &mut xml, 2);
345        xml += &format!("<leader>{}</leader>", &escape_xml(self.leader(), false));
346
347        // Control Fields
348
349        for cfield in self.control_fields() {
350            format(options.formatted, &mut xml, 2);
351
352            xml += &format!(
353                r#"<controlfield tag="{}">{}</controlfield>"#,
354                escape_xml(cfield.tag(), true),
355                escape_xml(cfield.content(), false),
356            );
357        }
358
359        // Data Fields
360
361        for field in self.fields() {
362            format(options.formatted, &mut xml, 2);
363
364            xml += &format!(
365                r#"<datafield tag="{}" ind1="{}" ind2="{}">"#,
366                escape_xml(field.tag(), true),
367                escape_xml(field.ind1(), true),
368                escape_xml(field.ind2(), true),
369            );
370
371            for sf in field.subfields() {
372                format(options.formatted, &mut xml, 4);
373
374                xml += &format!(
375                    r#"<subfield code="{}">{}</subfield>"#,
376                    &escape_xml(sf.code(), true),
377                    &escape_xml(sf.content(), false)
378                );
379            }
380
381            format(options.formatted, &mut xml, 2);
382
383            xml += "</datafield>";
384        }
385
386        format(options.formatted, &mut xml, 0);
387
388        xml += "</record>";
389
390        xml
391    }
392}
393
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `xml` and collect the first 245$a value of every record,
    /// panicking if any record fails to parse.
    fn first_titles(xml: &str) -> Vec<String> {
        Record::from_xml(xml)
            .map(|item| item.unwrap().get_field_values("245", "a")[0].to_owned())
            .collect()
    }

    #[test]
    fn test_can_parse_xml_string_with_namespace() {
        let titles = first_titles(
            r#"<collection xmlns="http://www.loc.gov/MARC21/slim">
                <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">First title</subfield></datafield></record>
                <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Second title</subfield></datafield></record>
            </collection>"#,
        );

        assert_eq!(titles, ["First title", "Second title"]);
    }

    #[test]
    fn test_can_parse_xml_string_without_namespace() {
        let titles = first_titles(
            r#"<collection>
                <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">First title</subfield></datafield></record>
                <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Second title</subfield></datafield></record>
            </collection>"#,
        );

        assert_eq!(titles, ["First title", "Second title"]);
    }

    #[test]
    fn test_can_parse_xml_string_without_collection() {
        let titles = first_titles(
            r#"<record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">First title</subfield></datafield></record>
                <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Second title</subfield></datafield></record>"#,
        );

        assert_eq!(titles, ["First title", "Second title"]);
    }
}