marctk/
binary.rs

1//! Routines for reading and writing binary MARC data.
2use super::Controlfield;
3use super::Field;
4use super::Record;
5use super::Subfield;
6use std::fs::File;
7use std::io::prelude::*;
8
/// Byte 0x1E: written after the directory and after every field.
const END_OF_FIELD: u8 = 0x1E;
/// Byte 0x1D: written as the final byte of a record.
const END_OF_RECORD: u8 = 0x1D;
/// Width, in bytes, of the record-size entry at the start of the leader.
const RECORD_SIZE_ENTRY: usize = 5;
/// Total width of the leader in bytes.
const LEADER_SIZE: usize = 24;
/// Position within the leader of the data-offset entry.
const DATA_OFFSET_START: usize = 12;
/// Width of a data-offset value (leader data offset and directory start position).
const DATA_OFFSET_SIZE: usize = 5;
/// Width of the field-length value in a directory entry.
const DATA_LENGTH_SIZE: usize = 4;
/// Width of one directory entry: 3-byte tag + field length + start position.
const DIRECTORY_ENTRY_LEN: usize = 3 + DATA_LENGTH_SIZE + DATA_OFFSET_SIZE;
/// Character 0x1F: precedes every subfield code within a data field.
const SUBFIELD_SEPARATOR: &str = "\x1F";
/// Largest record size accepted when encoding (five decimal digits).
const MAX_RECORD_BYTES: usize = 99_999;
19
20/// Parses a binary MARC file and emits [`Record`] values.
21pub struct BinaryRecordIterator {
22    file: File,
23}
24
25impl Iterator for BinaryRecordIterator {
26    type Item = Result<Record, String>;
27
28    /// Returns the next [`Record`] extracted from the binary content.
29    fn next(&mut self) -> Option<Self::Item> {
30        let mut bytes: Vec<u8> = Vec::new();
31
32        loop {
33            // Read the file one byte at a time until we encounter an
34            // END_OF_RECORD byte.  Pass the bytes to the Record binary parser.
35
36            let mut buf: [u8; 1] = [0];
37            match self.file.read(&mut buf) {
38                Ok(count) => {
39                    if count == 1 {
40                        bytes.push(buf[0]);
41                        if buf[0] == END_OF_RECORD {
42                            break;
43                        }
44                    } else {
45                        break; // EOF
46                    }
47                }
48                Err(e) => {
49                    return Some(Err(format!("Error reading file: {:?} {}", self.file, e)));
50                }
51            }
52        }
53
54        if !bytes.is_empty() {
55            match Record::from_binary(bytes.as_slice()) {
56                Ok(r) => return Some(Ok(r)),
57                Err(e) => return Some(Err(format!("Error processing bytes: {:?} {}", bytes, e))),
58            }
59        }
60
61        None
62    }
63}
64
65impl BinaryRecordIterator {
66    /// Create a new [`BinaryRecordIterator`] from a file
67    fn from_file(filename: &str) -> Result<Self, String> {
68        let file = match File::open(filename) {
69            Ok(f) => f,
70            Err(e) => return Err(format!("Cannot read MARC file: {filename} {e}")),
71        };
72
73        Ok(BinaryRecordIterator { file })
74    }
75}
76
/// Decodes a run of bytes spelling out a decimal number (e.g. `b"00059"`)
/// into that number.
///
/// # Errors
///
/// Returns a descriptive message if the bytes are not valid UTF-8 or if
/// the resulting text does not parse as a `usize`.
///
/// ```
/// use marctk::binary;
///
/// let b = &[49, 50, 51, 52]; // "1234"
/// assert_eq!(binary::bytes_to_usize(b), Ok(1234))
/// ```
pub fn bytes_to_usize(bytes: &[u8]) -> Result<usize, String> {
    let bytes_str = std::str::from_utf8(bytes)
        .map_err(|e| format!("Error translating bytes to string: {bytes:?} {e}"))?;

    bytes_str
        .parse::<usize>()
        .map_err(|e| format!("Error translating string to usize str={bytes_str} {e}"))
}
97
98/// Models the position/size data for a single, variable-length (control
99/// or data) field.
100pub struct DirectoryEntry {
101    tag: String,
102    field_start_idx: usize,
103    field_end_idx: usize,
104}
105
106impl DirectoryEntry {
107    /// Create a new directory entry from a set of bytes and positional
108    /// information.
109    ///
110    /// * `which` - Which entry is this in the directory
111    /// * `data_start_idx` - Where in the record as a whole does the data
112    ///   we care about start.
113    /// * `dir_bytes` - Bytes of this directory entry.
114    ///
115    /// # References
116    ///
117    /// * <https://www.loc.gov/marc/bibliographic/bddirectory.html>
118    pub fn new(which: usize, data_start_idx: usize, dir_bytes: &[u8]) -> Result<Self, String> {
119        let start = which * DIRECTORY_ENTRY_LEN;
120        let end = start + DIRECTORY_ENTRY_LEN;
121        let bytes = &dir_bytes[start..end];
122
123        let entry_str = match std::str::from_utf8(bytes) {
124            Ok(s) => s,
125            Err(e) => return Err(format!("Invalid directory bytes: {:?} {}", bytes, e)),
126        };
127
128        let field_tag = &entry_str[0..3];
129        let field_len_str = &entry_str[3..7];
130        let field_pos_str = &entry_str[7..12];
131
132        let field_len = match field_len_str.parse::<usize>() {
133            Ok(l) => l,
134            Err(e) => return Err(format!("Invalid data length value {} {}", field_len_str, e)),
135        };
136
137        // Where does this field start in the record as a whole
138        let field_start_idx = match field_pos_str.parse::<usize>() {
139            Ok(l) => l,
140            Err(e) => {
141                return Err(format!(
142                    "Invalid data position value {} {}",
143                    field_pos_str, e
144                ));
145            }
146        };
147
148        let start = field_start_idx + data_start_idx;
149        let last = start + field_len - 1; // Discard END_OF_FIELD char
150
151        Ok(DirectoryEntry {
152            tag: field_tag.to_string(),
153            field_start_idx: start,
154            field_end_idx: last,
155        })
156    }
157}
158
159impl Record {
160    /// Returns an iterator over MARC records produced from a binary file.
161    pub fn from_binary_file(filename: &str) -> Result<BinaryRecordIterator, String> {
162        BinaryRecordIterator::from_file(filename)
163    }
164
165    /// Creates a single MARC Record from a series of bytes.
166    ///
167    /// # References
168    ///
169    /// * <https://www.loc.gov/marc/bibliographic/bdleader.html>
170    /// * <https://www.loc.gov/marc/bibliographic/bddirectory.html>
171    pub fn from_binary(rec_bytes: &[u8]) -> Result<Record, String> {
172        let mut record = Record::new();
173
174        let rec_byte_count = rec_bytes.len();
175
176        if rec_byte_count < LEADER_SIZE {
177            return Err(format!("Binary record is too short: {:?}", rec_bytes));
178        }
179
180        let leader_bytes = &rec_bytes[0..LEADER_SIZE];
181
182        // Reported size of the record byte chunk
183        let size_bytes = &leader_bytes[0..RECORD_SIZE_ENTRY];
184
185        // Repported size of the record as a number
186        let rec_size = bytes_to_usize(size_bytes)?;
187
188        if rec_byte_count != rec_size {
189            return Err(format!(
190                "Record has incorrect size reported={} real={}",
191                rec_size, rec_byte_count
192            ));
193        }
194
195        record.set_leader_bytes(leader_bytes)?;
196
197        // Where in this pile of bytes do the control/data fields tart.
198        let data_offset_bytes =
199            &leader_bytes[DATA_OFFSET_START..(DATA_OFFSET_START + DATA_OFFSET_SIZE)];
200
201        let data_start_idx = bytes_to_usize(data_offset_bytes)?;
202
203        // The full directory as bytes.
204        // -1 to skip the END_OF_FIELD
205        let dir_bytes = &rec_bytes[LEADER_SIZE..(data_start_idx - 1)];
206
207        // Directory byte length should be divisible by the directry entry length.
208        let dir_len = dir_bytes.len();
209        if dir_len == 0 || dir_len % DIRECTORY_ENTRY_LEN != 0 {
210            return Err(format!("Invalid directory length {}", dir_len));
211        }
212
213        // How many directory entries are in this record.
214        let dir_count = dir_bytes.len() / DIRECTORY_ENTRY_LEN;
215        let mut dir_idx = 0;
216
217        while dir_idx < dir_count {
218            let dir_entry = DirectoryEntry::new(dir_idx, data_start_idx, dir_bytes)?;
219
220            if let Err(e) = record.process_directory_entry(rec_bytes, rec_byte_count, &dir_entry) {
221                return Err(format!(
222                    "Error processing directory entry index={} {}",
223                    dir_idx, e
224                ));
225            }
226
227            dir_idx += 1;
228        }
229
230        Ok(record)
231    }
232
233    /// Unpack a single control field / data field and append to the
234    /// record in progress.
235    ///
236    /// # References
237    ///
238    /// * <https://www.loc.gov/marc/bibliographic/bddirectory.html>
239    fn process_directory_entry(
240        &mut self,
241        rec_bytes: &[u8],      // full record as bytes
242        rec_byte_count: usize, // full size of record
243        dir_entry: &DirectoryEntry,
244    ) -> Result<(), String> {
245        if (dir_entry.field_end_idx) >= rec_byte_count {
246            return Err(format!(
247                "Field length exceeds length of record for tag={}",
248                dir_entry.tag
249            ));
250        }
251
252        // Extract the bytes for this directory entry from the directory.
253        let field_bytes = &rec_bytes[dir_entry.field_start_idx..dir_entry.field_end_idx];
254
255        // Turn said bytes into a string
256        let field_str = match std::str::from_utf8(field_bytes) {
257            Ok(s) => s,
258            Err(e) => {
259                return Err(format!(
260                    "Field data is not UTF-8 compatible: {:?} {}",
261                    field_bytes, e
262                ));
263            }
264        };
265
266        if dir_entry.tag.as_str() < "010" {
267            let content = if !field_str.is_empty() { field_str } else { "" };
268
269            let cf = Controlfield::new(&dir_entry.tag, content)?;
270            self.control_fields_mut().push(cf);
271            return Ok(());
272        }
273
274        // 3-bytes for tag
275        // 1 byte for indicator 1
276        // 1 byte for indicator 2
277        let mut field = Field::new(&dir_entry.tag)?;
278
279        field.set_ind1(&field_str[..1])?;
280        field.set_ind2(&field_str[1..2])?;
281
282        // Split the remainder on the subfield separator and
283        // build Field's from them.
284        let field_parts: Vec<&str> = field_str.split(SUBFIELD_SEPARATOR).collect();
285
286        for part in &field_parts[1..] {
287            // skip the initial SUBFIELD_SEPARATOR
288            let sf = Subfield::new(&part[..1], if part.len() > 1 { &part[1..] } else { "" })?;
289            field.subfields_mut().push(sf);
290        }
291
292        self.fields_mut().push(field);
293
294        Ok(())
295    }
296
297    /// Generates the binary form of a MARC record as a vector of bytes.
298    ///
299    /// # Examples
300    /// ```
301    /// use marctk::Record;
302    /// let mut my_record = Record::new();
303    /// my_record
304    ///     .add_data_field("245")
305    ///     .unwrap()
306    ///     .add_subfield("a", "My favorite book")
307    ///     .unwrap();
308    /// assert_eq!(
309    ///     my_record.to_binary().unwrap(),
310    ///     "00059       00037       245002100000\x1E  \x1FaMy favorite book\x1E\x1D".as_bytes()
311    /// );
312    /// ```
313    pub fn to_binary(&self) -> Result<Vec<u8>, String> {
314        let mut bytes: Vec<u8> = Vec::new();
315
316        bytes.append(&mut self.leader().as_bytes().to_vec());
317
318        // Directory
319        let num_dirs = self.build_directory(&mut bytes);
320
321        // End-of-field after Directory
322        bytes.push(END_OF_FIELD);
323
324        self.add_data_fields(&mut bytes);
325
326        // End-of-record after all data fields are added
327        bytes.push(END_OF_RECORD);
328
329        // Make sure the size and data offset for the leader match.
330        self.sync_leader(num_dirs, &mut bytes)?;
331
332        Ok(bytes)
333    }
334
335    /// Compile the directory entries for the control fields and data fields.
336    ///
337    /// # References
338    ///
339    /// * <https://www.loc.gov/marc/bibliographic/bddirectory.html>
340    fn build_directory(&self, bytes: &mut Vec<u8>) -> usize {
341        let mut num_dirs = 0;
342        let mut prev_end_idx = 0;
343
344        for field in self.control_fields() {
345            num_dirs += 1;
346
347            let mut field_len = field.content().len();
348
349            field_len += 1; // end of field terminator
350
351            // Our directory entry as a string.
352            let s = format!(
353                "{}{:0w1$}{:0w2$}",
354                field.tag(),
355                field_len,
356                prev_end_idx, // our starting point
357                w1 = DATA_LENGTH_SIZE,
358                w2 = DATA_OFFSET_SIZE
359            );
360
361            bytes.append(&mut s.as_bytes().to_vec());
362
363            prev_end_idx += field_len;
364        }
365
366        for field in self.fields() {
367            num_dirs += 1;
368
369            let mut field_len = 3; // ind1 + ind2 + field terminator
370            for sf in field.subfields() {
371                field_len += 2; // sf code + separator
372                field_len += sf.content().len();
373            }
374
375            // Our directory entry as a string.
376            let s = format!(
377                "{}{:0w1$}{:0w2$}",
378                field.tag(),
379                field_len,
380                prev_end_idx, // our starting point
381                w1 = DATA_LENGTH_SIZE,
382                w2 = DATA_OFFSET_SIZE
383            );
384
385            bytes.append(&mut s.as_bytes().to_vec());
386
387            prev_end_idx += field_len;
388        }
389
390        num_dirs
391    }
392
393    /// Appends the binary forms of the control fields and data fields.
394    fn add_data_fields(&self, bytes: &mut Vec<u8>) {
395        // Now append the actual data
396        for field in self.control_fields() {
397            bytes.append(&mut field.content().as_bytes().to_vec());
398            bytes.push(END_OF_FIELD);
399        }
400
401        for field in self.fields() {
402            let s = format!("{}{}", field.ind1(), field.ind2());
403            bytes.append(&mut s.as_bytes().to_vec());
404
405            for sf in field.subfields() {
406                let s = format!("{}{}{}", SUBFIELD_SEPARATOR, sf.code(), sf.content());
407                bytes.append(&mut s.as_bytes().to_vec());
408            }
409
410            bytes.push(END_OF_FIELD);
411        }
412    }
413
414    /// Sync the byte count and data offset values in the leader to
415    /// match the record just created.
416    fn sync_leader(&self, num_dirs: usize, bytes: &mut [u8]) -> Result<(), String> {
417        let blen = bytes.len();
418
419        if blen > MAX_RECORD_BYTES {
420            return Err(format!(
421                "MARC byte count {blen} too large for binary encoding"
422            ));
423        }
424
425        let size_str = format!("{:0w$}", blen, w = RECORD_SIZE_ENTRY);
426        let size_bytes = size_str.as_bytes();
427
428        bytes[0..RECORD_SIZE_ENTRY].copy_from_slice(size_bytes);
429
430        // Set the start index of the body of the record
431        let data_start_idx = LEADER_SIZE + (num_dirs * DIRECTORY_ENTRY_LEN) + 1; // end-of-field
432        let data_start_str = format!("{:0w$}", data_start_idx, w = DATA_OFFSET_SIZE);
433
434        let dstart = DATA_OFFSET_START;
435        let dend = dstart + DATA_OFFSET_SIZE;
436
437        bytes[dstart..dend].copy_from_slice(data_start_str.as_bytes());
438
439        Ok(())
440    }
441}