1use std::fs::File;
3use std::io::BufReader;
4use std::io::Cursor;
5use xml::attribute::OwnedAttribute;
6use xml::reader::{EventReader, XmlEvent};
7
8use super::Controlfield;
9use super::Field;
10use super::Record;
11use super::Subfield;
12
/// Namespace URI written as the default `xmlns` of a serialized `<record>`.
pub const MARCXML_NAMESPACE: &str = "http://www.loc.gov/MARC21/slim";

/// Namespace URI written as the `xmlns:xsi` attribute of a serialized `<record>`.
pub const MARCXML_XSI_NAMESPACE: &str = "http://www.w3.org/2001/XMLSchema-instance";

/// Value written as the `xsi:schemaLocation` attribute of a serialized `<record>`.
pub const MARCXML_SCHEMA_LOCATION: &str =
    "http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd";
17
/// Escapes `value` so it can be embedded in XML text or in an attribute value.
///
/// * `&`, `<` and `>` are always replaced by `&amp;`, `&lt;` and `&gt;`.
/// * `'` and `"` are replaced by `&apos;` and `&quot;` only when `is_attr`
///   is true; in element text they are copied through unchanged.
/// * Every character greater than `~` is written as an upper-case
///   hexadecimal numeric character reference (`&#xE9;`).
/// * All other characters are copied through unchanged.
pub fn escape_xml(value: &str, is_attr: bool) -> String {
    // The output is at least as long as the input.
    let mut buf = String::with_capacity(value.len());

    for c in value.chars() {
        match c {
            '&' => buf.push_str("&amp;"),
            '\'' if is_attr => buf.push_str("&apos;"),
            '"' if is_attr => buf.push_str("&quot;"),
            '>' => buf.push_str("&gt;"),
            '<' => buf.push_str("&lt;"),
            _ if c > '~' => {
                let ord = u32::from(c);
                buf.push_str(&format!("&#x{ord:X};"));
            }
            _ => buf.push(c),
        }
    }

    buf
}
51
/// Appends a line break followed by `depth` spaces to `value` when
/// `formatted` is true; leaves `value` untouched otherwise.
fn format(formatted: bool, value: &mut String, depth: u8) {
    if !formatted {
        return;
    }

    value.push('\n');
    value.extend(std::iter::repeat(' ').take(usize::from(depth)));
}
61
/// Options controlling how a record is serialized to XML.
///
/// `XmlOptions::default()` has both flags turned off.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct XmlOptions {
    /// When true, each element starts on its own indented line.
    pub formatted: bool,
    /// When true, the output starts with an `<?xml version="1.0"?>` declaration.
    pub with_xml_declaration: bool,
}
69
70struct XmlParseContext {
71 record: Record,
72 in_cfield: bool,
73 in_subfield: bool,
74 in_leader: bool,
75 record_complete: bool,
76 doc_complete: bool,
77}
78
79pub enum XmlRecordIterator {
80 FileReader(EventReader<BufReader<File>>),
81 ByteReader(EventReader<Cursor<Vec<u8>>>),
82}
83
84impl Iterator for XmlRecordIterator {
85 type Item = Result<Record, String>;
86
87 fn next(&mut self) -> Option<Self::Item> {
88 let mut context = XmlParseContext {
89 record: Record::new(),
90 in_cfield: false,
91 in_subfield: false,
92 in_leader: false,
93 record_complete: false,
94 doc_complete: false,
95 };
96
97 self.read_next(&mut context).transpose()
98 }
99}
100
101impl XmlRecordIterator {
102 fn from_file(filename: &str) -> Result<Self, String> {
104 match File::open(filename) {
105 Ok(file) => Ok(XmlRecordIterator::FileReader(EventReader::new(
106 BufReader::new(file),
107 ))),
108 Err(e) => Err(format!("Cannot read MARCXML file: {filename} {e}")),
109 }
110 }
111
112 fn from_string(xml: &str) -> Self {
114 XmlRecordIterator::ByteReader(EventReader::new(Cursor::new(xml.as_bytes().to_vec())))
115 }
116
117 fn read_next(&mut self, context: &mut XmlParseContext) -> Result<Option<Record>, String> {
119 loop {
120 let evt_res = match *self {
121 XmlRecordIterator::FileReader(ref mut reader) => reader.next(),
122 XmlRecordIterator::ByteReader(ref mut reader) => reader.next(),
123 };
124
125 let evt = evt_res.map_err(|e| format!("Error processing XML: {e}"))?;
126
127 if let Err(e) = self.handle_xml_event(context, evt) {
128 return Err(format!("Error processing XML: {e}"));
129 }
130
131 if context.record_complete {
132 return Ok(Some(std::mem::take(&mut context.record)));
134 } else if context.doc_complete {
135 context.record = Record::new();
137
138 return Ok(None);
140 }
141 }
142 }
143
144 fn handle_xml_event(
146 &mut self,
147 context: &mut XmlParseContext,
148 evt: XmlEvent,
149 ) -> Result<(), String> {
150 let record = &mut context.record;
151
152 match evt {
153 XmlEvent::StartElement {
154 name, attributes, ..
155 } => {
156 self.handle_start_element(context, name.local_name.as_str(), &attributes)?;
157 }
158
159 XmlEvent::Characters(ref characters) => {
160 if context.in_leader {
161 record.set_leader(characters)?;
162 context.in_leader = false;
163 } else if context.in_cfield {
164 if let Some(cf) = record.control_fields_mut().last_mut() {
165 cf.set_content(characters);
166 }
167 context.in_cfield = false;
168 } else if context.in_subfield {
169 if let Some(field) = record.fields_mut().last_mut() {
170 if let Some(subfield) = field.subfields_mut().last_mut() {
171 subfield.set_content(characters);
172 }
173 }
174 context.in_subfield = false;
175 }
176 }
177
178 XmlEvent::EndElement { name, .. } => {
179 if name.local_name.as_str() == "record" {
180 context.record_complete = true;
181 }
182 }
183
184 XmlEvent::EndDocument => {
185 context.doc_complete = true;
186 }
187
188 _ => {}
189 }
190
191 Ok(())
192 }
193
194 fn handle_start_element(
195 &mut self,
196 context: &mut XmlParseContext,
197 name: &str,
198 attributes: &Vec<OwnedAttribute>,
199 ) -> Result<(), String> {
200 let record = &mut context.record;
201
202 match name {
203 "leader" => context.in_leader = true,
204
205 "controlfield" => {
206 if let Some(t) = attributes.iter().find(|a| a.name.local_name.eq("tag")) {
207 record
208 .control_fields_mut()
209 .push(Controlfield::new(&t.value, "")?);
210 context.in_cfield = true;
211 } else {
212 return Err("Controlfield has no tag".to_string());
213 }
214 }
215
216 "datafield" => {
217 let mut field = match attributes.iter().find(|a| a.name.local_name.eq("tag")) {
218 Some(attr) => Field::new(&attr.value)?,
219 None => {
220 return Err("Data field has no tag".to_string());
221 }
222 };
223
224 for attr in attributes {
225 match attr.name.local_name.as_str() {
226 "ind1" => field.set_ind1(&attr.value)?,
227 "ind2" => field.set_ind2(&attr.value)?,
228 _ => {}
229 }
230 }
231
232 record.fields_mut().push(field);
233 }
234
235 "subfield" => {
236 let field_op = record.fields_mut().last_mut();
237
238 if field_op.is_none() {
239 return Err("Encounted <subfield/> without a field".to_string());
240 }
241
242 let field = field_op.unwrap();
243 for attr in attributes {
244 if attr.name.local_name.eq("code") {
245 context.in_subfield = true;
246 field.subfields_mut().push(Subfield::new(&attr.value, "")?);
247 break;
248 }
249 }
250 }
251 _ => {}
252 }
253
254 Ok(())
255 }
256}
257
258impl Record {
259 pub fn from_xml_file(filename: &str) -> Result<XmlRecordIterator, String> {
261 XmlRecordIterator::from_file(filename)
262 }
263
264 pub fn from_xml(xml: &str) -> XmlRecordIterator {
284 XmlRecordIterator::from_string(xml)
285 }
286
287 #[deprecated(note = "See to_xml_string()")]
288 pub fn to_xml(&self) -> String {
289 self.to_xml_string()
290 }
291
292 pub fn to_xml_string(&self) -> String {
294 self.to_xml_string_ops(&XmlOptions {
295 formatted: false,
296 with_xml_declaration: false,
297 })
298 }
299
300 #[deprecated(note = "See to_xml_string_formatted()")]
301 pub fn to_xml_formatted(&self) -> String {
302 self.to_xml_string_formatted()
303 }
304
305 pub fn to_xml_string_formatted(&self) -> String {
307 self.to_xml_string_ops(&XmlOptions {
308 formatted: true,
309 with_xml_declaration: false,
310 })
311 }
312
313 #[deprecated(note = "See to_xml_string_ops()")]
314 pub fn to_xml_ops(&self, options: &XmlOptions) -> String {
315 self.to_xml_string_ops(options)
316 }
317
318 pub fn to_xml_string_ops(&self, options: &XmlOptions) -> String {
320 let mut xml = match options.with_xml_declaration {
324 true => String::from(r#"<?xml version="1.0"?>"#),
325 _ => String::new(),
326 };
327
328 if options.formatted {
331 xml += &format!(
332 "\n<record\n xmlns=\"{}\"\n xmlns:xsi=\"{}\"\n xsi:schemaLocation=\"{}\">",
333 MARCXML_NAMESPACE, MARCXML_XSI_NAMESPACE, MARCXML_SCHEMA_LOCATION
334 );
335 } else {
336 xml += &format!(
337 r#"<record xmlns="{}" xmlns:xsi="{}" xsi:schemaLocation="{}">"#,
338 MARCXML_NAMESPACE, MARCXML_XSI_NAMESPACE, MARCXML_SCHEMA_LOCATION
339 );
340 }
341
342 format(options.formatted, &mut xml, 2);
345 xml += &format!("<leader>{}</leader>", &escape_xml(self.leader(), false));
346
347 for cfield in self.control_fields() {
350 format(options.formatted, &mut xml, 2);
351
352 xml += &format!(
353 r#"<controlfield tag="{}">{}</controlfield>"#,
354 escape_xml(cfield.tag(), true),
355 escape_xml(cfield.content(), false),
356 );
357 }
358
359 for field in self.fields() {
362 format(options.formatted, &mut xml, 2);
363
364 xml += &format!(
365 r#"<datafield tag="{}" ind1="{}" ind2="{}">"#,
366 escape_xml(field.tag(), true),
367 escape_xml(field.ind1(), true),
368 escape_xml(field.ind2(), true),
369 );
370
371 for sf in field.subfields() {
372 format(options.formatted, &mut xml, 4);
373
374 xml += &format!(
375 r#"<subfield code="{}">{}</subfield>"#,
376 &escape_xml(sf.code(), true),
377 &escape_xml(sf.content(), false)
378 );
379 }
380
381 format(options.formatted, &mut xml, 2);
382
383 xml += "</datafield>";
384 }
385
386 format(options.formatted, &mut xml, 0);
387
388 xml += "</record>";
389
390 xml
391 }
392}
393
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `xml` and returns the first 245$a value of every record,
    /// panicking if a record fails to parse or has no such value.
    fn first_titles(xml: &str) -> Vec<String> {
        let mut titles: Vec<String> = Vec::new();

        for item in Record::from_xml(xml) {
            let record = item.unwrap();
            titles.push(record.get_field_values("245", "a")[0].to_owned());
        }

        titles
    }

    /// Titles expected from every fixture in this module.
    fn expected_titles() -> [String; 2] {
        ["First title".to_string(), "Second title".to_string()]
    }

    #[test]
    fn test_can_parse_xml_string_with_namespace() {
        let titles = first_titles(
            r#"<collection xmlns="http://www.loc.gov/MARC21/slim">
 <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">First title</subfield></datafield></record>
 <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Second title</subfield></datafield></record>
 </collection>"#,
        );

        assert_eq!(titles, expected_titles());
    }

    #[test]
    fn test_can_parse_xml_string_without_namespace() {
        let titles = first_titles(
            r#"<collection>
 <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">First title</subfield></datafield></record>
 <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Second title</subfield></datafield></record>
 </collection>"#,
        );

        assert_eq!(titles, expected_titles());
    }

    #[test]
    fn test_can_parse_xml_string_without_collection() {
        let titles = first_titles(
            r#"<record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">First title</subfield></datafield></record>
 <record><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Second title</subfield></datafield></record>"#,
        );

        assert_eq!(titles, expected_titles());
    }
}