From 8914cf35078500697f42d5f2522d1bd1f239acaa Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Sat, 11 Nov 2023 00:25:19 +1100 Subject: [PATCH] turnbull: Custom CSV implementation Avoid unnecessary String allocation 13% speedup --- src/csv.rs | 133 ++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/turnbull.rs | 46 ++++------------- 3 files changed, 145 insertions(+), 35 deletions(-) create mode 100644 src/csv.rs diff --git a/src/csv.rs b/src/csv.rs new file mode 100644 index 0000000..9885351 --- /dev/null +++ b/src/csv.rs @@ -0,0 +1,133 @@ +// hpstat: High-performance statistics implementations +// Copyright © 2023 Lee Yingtong Li (RunasSudo) +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +use std::io::BufRead; + +pub fn read_csv<R: BufRead>(mut reader: R) -> (Vec<String>, Vec<f64>) { + // This custom CSV parser is faster than the csv library because we do not waste time allocating Strings for the data which will inevitably be parsed to float anyway + + // Reuse a single buffer to avoid unnecessary allocations + // Since we need to make copies only for the headers - the data are directly parsed to float + let mut buffer = String::new(); + + // Read header + let headers = read_row_as_strings(&mut reader, &mut buffer); + + // Read data + let mut data = Vec::new(); + let mut row = Vec::new(); + loop { + if read_row_as_floats(&mut reader, &mut buffer, &mut row) { + if row.len() != headers.len() { + panic!("Expected row of {} entries, got {} entries", headers.len(), row.len()); + } + + data.append(&mut row); + } else { + // EOF + break; + } + } + + return (headers, data); +} + +fn read_row_as_strings<R: BufRead>(reader: &mut R, buffer: &mut String) -> Vec<String> { + buffer.clear(); + + let bytes_read = reader.read_line(buffer).expect("IO error"); + if bytes_read == 0 { + panic!("Unexpected EOF"); + } + + let mut result = Vec::new(); + let mut entries_iter = buffer.trim().split(','); + loop { + if let Some(entry) = entries_iter.next() { + if entry.starts_with('"') { + if entry.ends_with('"') { + result.push(String::from(&entry[1..(entry.len() - 1)])); + } else { + let mut full_entry = String::from(&entry[1..]); + + // Read remainder of quoted entry + loop { + if let Some(entry_part) = entries_iter.next() { + if entry_part.ends_with('"') { + // End of quoted entry + full_entry.push_str(&entry_part[..(entry_part.len() - 1)]); + result.push(full_entry); + break; + } else { + // Middle of quoted entry + full_entry.push_str(entry_part); + full_entry.push_str(&","); + } + } else { + panic!("Unexpected EOL while reading quoted CSV entry"); + } + } + } + } else { + result.push(String::from(entry)); + } + } else { + // EOL + break; + } + } + + return result; +} + +fn read_row_as_floats<R: BufRead>(reader: &mut R, 
buffer: &mut String, row: &mut Vec<f64>) -> bool { + buffer.clear(); + + let bytes_read = reader.read_line(buffer).expect("IO error"); + if bytes_read == 0 { + // EOF + return false; + } + + let mut entries_iter = buffer.trim().split(','); + loop { + if let Some(entry) = entries_iter.next() { + if entry.starts_with('"') { + if entry.ends_with('"') { + row.push(parse_float(&entry[1..(entry.len() - 1)])); + } else { + // Float cannot have a comma in it + panic!("Malformed float"); + } + } else { + row.push(parse_float(entry)); + } + } else { + // EOL + break; + } + } + + return true; +} + +fn parse_float(s: &str) -> f64 { + let value = match s { + "inf" => f64::INFINITY, + _ => s.parse().expect("Malformed float") + }; + return value; +} diff --git a/src/lib.rs b/src/lib.rs index 73933ed..11e3925 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ pub mod intcox; pub mod turnbull; +mod csv; mod pava; mod term; diff --git a/src/turnbull.rs b/src/turnbull.rs index 0419e07..eb093db 100644 --- a/src/turnbull.rs +++ b/src/turnbull.rs @@ -17,17 +17,17 @@ const Z_97_5: f64 = 1.959964; // This is the limit of resolution for an f64 const CHI2_1DF_95: f64 = 3.8414588; -use core::mem::MaybeUninit; -use std::io; +use std::fs::File; +use std::io::{self, BufReader}; use clap::{Args, ValueEnum}; -use csv::{Reader, StringRecord}; use indicatif::{ParallelProgressIterator, ProgressBar, ProgressDrawTarget, ProgressStyle}; -use nalgebra::{Const, DMatrix, DVector, Dyn, Matrix2xX}; +use nalgebra::{DMatrix, DVector, Matrix2xX}; use prettytable::{Table, format, row}; use rayon::prelude::*; use serde::{Serialize, Deserialize}; +use crate::csv::read_csv; use crate::pava::monotonic_regression_pava; use crate::term::UnconditionalTermLike; @@ -134,45 +134,21 @@ pub fn main(args: TurnbullArgs) { pub fn read_data(path: &str) -> Matrix2xX<f64> { // Read CSV into memory - let _headers: StringRecord; - let records: Vec<StringRecord>; - if path == "-" { - let mut csv_reader = Reader::from_reader(io::stdin()); - _headers 
= csv_reader.headers().unwrap().clone(); - records = csv_reader.records().map(|r| r.unwrap()).collect(); - } else { - let mut csv_reader = Reader::from_path(path).unwrap(); - _headers = csv_reader.headers().unwrap().clone(); - records = csv_reader.records().map(|r| r.unwrap()).collect(); - } + let (_headers, records) = match path { + "-" => read_csv(io::stdin().lock()), + _ => read_csv(BufReader::new(File::open(path).expect("IO error"))) + }; // Read data into matrices // Represent data_times as 2xX rather than Xx2 matrix to allow par_column_iter in code_times_as_indexes (no par_row_iter) - let mut data_times: Matrix2xX<MaybeUninit<f64>> = Matrix2xX::uninit( - Const::<2>, // Left time, right time - Dyn(records.len()) - ); - - // Parse data - for (i, row) in records.iter().enumerate() { - for (j, item) in row.iter().enumerate() { - let value = match item { - "inf" => f64::INFINITY, - _ => item.parse().expect("Malformed float") - }; - - data_times[(j, i)].write(value); - } - } + // Serendipitously, from_vec fills column-by-column + let data_times = Matrix2xX::from_vec(records); // TODO: Fail on left time > right time // TODO: Fail on left time < 0 - // SAFETY: assume_init is OK because we initialised all values above - unsafe { - return data_times.assume_init(); - } + return data_times; } struct TurnbullData {