turnbull: Custom CSV implementation
Avoid unnecessary String allocation: 13% speedup
parent b691c5a8d7
commit 8914cf3507
src/csv.rs (new file, 133 lines)
@@ -0,0 +1,133 @@
// hpstat: High-performance statistics implementations
// Copyright © 2023 Lee Yingtong Li (RunasSudo)
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use std::io::BufRead;

pub fn read_csv<R: BufRead>(mut reader: R) -> (Vec<String>, Vec<f64>) {
    // This custom CSV parser is faster than the csv library because we do not waste time allocating Strings for the data which will inevitably be parsed to float anyway

    // Reuse a single buffer to avoid unnecessary allocations
    // Since we need to make copies only for the headers - the data are directly parsed to float
    let mut buffer = String::new();

    // Read header
    let headers = read_row_as_strings(&mut reader, &mut buffer);

    // Read data
    let mut data = Vec::new();
    let mut row = Vec::new();
    loop {
        if read_row_as_floats(&mut reader, &mut buffer, &mut row) {
            if row.len() != headers.len() {
                panic!("Expected row of {} entries, got {} entries", headers.len(), row.len());
            }

            data.append(&mut row);
        } else {
            // EOF
            break;
        }
    }

    return (headers, data);
}

fn read_row_as_strings<R: BufRead>(reader: &mut R, buffer: &mut String) -> Vec<String> {
    buffer.clear();

    let bytes_read = reader.read_line(buffer).expect("IO error");
    if bytes_read == 0 {
        panic!("Unexpected EOF");
    }

    let mut result = Vec::new();
    let mut entries_iter = buffer.trim().split(',');
    loop {
        if let Some(entry) = entries_iter.next() {
            if entry.starts_with('"') {
                if entry.ends_with('"') {
                    result.push(String::from(&entry[1..(entry.len() - 1)]));
                } else {
                    let mut full_entry = String::from(&entry[1..]);

                    // Read remainder of quoted entry
                    loop {
                        if let Some(entry_part) = entries_iter.next() {
                            if entry_part.ends_with('"') {
                                // End of quoted entry
                                full_entry.push_str(&entry_part[..(entry_part.len() - 1)]);
                                result.push(full_entry);
                                break;
                            } else {
                                // Middle of quoted entry
                                full_entry.push_str(entry_part);
                                full_entry.push_str(&",");
                            }
                        } else {
                            panic!("Unexpected EOL while reading quoted CSV entry");
                        }
                    }
                }
            } else {
                result.push(String::from(entry));
            }
        } else {
            // EOL
            break;
        }
    }

    return result;
}

fn read_row_as_floats<R: BufRead>(reader: &mut R, buffer: &mut String, row: &mut Vec<f64>) -> bool {
    buffer.clear();

    let bytes_read = reader.read_line(buffer).expect("IO error");
    if bytes_read == 0 {
        // EOF
        return false;
    }

    let mut entries_iter = buffer.trim().split(',');
    loop {
        if let Some(entry) = entries_iter.next() {
            if entry.starts_with('"') {
                if entry.ends_with('"') {
                    row.push(parse_float(&entry[1..(entry.len() - 1)]));
                } else {
                    // Float cannot have a comma in it
                    panic!("Malformed float");
                }
            } else {
                row.push(parse_float(entry));
            }
        } else {
            // EOL
            break;
        }
    }

    return true;
}

fn parse_float(s: &str) -> f64 {
    let value = match s {
        "inf" => f64::INFINITY,
        _ => s.parse().expect("Malformed float")
    };
    return value;
}
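For orientation, a minimal test-style sketch of how the new read_csv could be exercised; this is not part of the commit, and the test name and CSV values are invented for illustration. It relies only on the fact that &[u8] implements BufRead:

#[cfg(test)]
mod tests {
    use super::read_csv;

    #[test]
    fn parses_headers_and_flattens_data() {
        // A quoted header, a plain header, then two data rows; "inf" maps to f64::INFINITY
        let input: &[u8] = b"\"left\",right\n0,1\n2,inf\n";
        let (headers, data) = read_csv(input);
        assert_eq!(headers, vec!["left", "right"]);
        // Rows are appended into one flat Vec<f64>: [left_1, right_1, left_2, right_2, ...]
        assert_eq!(data, vec![0.0, 1.0, 2.0, f64::INFINITY]);
    }
}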
@@ -1,5 +1,6 @@
 pub mod intcox;
 pub mod turnbull;
 
+mod csv;
 mod pava;
 mod term;
src/turnbull.rs
@@ -17,17 +17,17 @@
 const Z_97_5: f64 = 1.959964; // This is the limit of resolution for an f64
 const CHI2_1DF_95: f64 = 3.8414588;
 
-use core::mem::MaybeUninit;
-use std::io;
+use std::fs::File;
+use std::io::{self, BufReader};
 
 use clap::{Args, ValueEnum};
-use csv::{Reader, StringRecord};
 use indicatif::{ParallelProgressIterator, ProgressBar, ProgressDrawTarget, ProgressStyle};
-use nalgebra::{Const, DMatrix, DVector, Dyn, Matrix2xX};
+use nalgebra::{DMatrix, DVector, Matrix2xX};
 use prettytable::{Table, format, row};
 use rayon::prelude::*;
 use serde::{Serialize, Deserialize};
 
+use crate::csv::read_csv;
 use crate::pava::monotonic_regression_pava;
 use crate::term::UnconditionalTermLike;
 
@@ -134,45 +134,21 @@ pub fn main(args: TurnbullArgs) {
 
 pub fn read_data(path: &str) -> Matrix2xX<f64> {
     // Read CSV into memory
-    let _headers: StringRecord;
-    let records: Vec<StringRecord>;
-    if path == "-" {
-        let mut csv_reader = Reader::from_reader(io::stdin());
-        _headers = csv_reader.headers().unwrap().clone();
-        records = csv_reader.records().map(|r| r.unwrap()).collect();
-    } else {
-        let mut csv_reader = Reader::from_path(path).unwrap();
-        _headers = csv_reader.headers().unwrap().clone();
-        records = csv_reader.records().map(|r| r.unwrap()).collect();
-    }
+    let (_headers, records) = match path {
+        "-" => read_csv(io::stdin().lock()),
+        _ => read_csv(BufReader::new(File::open(path).expect("IO error")))
+    };
 
     // Read data into matrices
-
-    // Represent data_times as 2xX rather than Xx2 matrix to allow par_column_iter in code_times_as_indexes (no par_row_iter)
-    let mut data_times: Matrix2xX<MaybeUninit<f64>> = Matrix2xX::uninit(
-        Const::<2>, // Left time, right time
-        Dyn(records.len())
-    );
-
-    // Parse data
-    for (i, row) in records.iter().enumerate() {
-        for (j, item) in row.iter().enumerate() {
-            let value = match item {
-                "inf" => f64::INFINITY,
-                _ => item.parse().expect("Malformed float")
-            };
-
-            data_times[(j, i)].write(value);
-        }
-    }
+    // Serendipitously, from_vec fills column-by-column
+    let data_times = Matrix2xX::from_vec(records);
 
     // TODO: Fail on left time > right time
     // TODO: Fail on left time < 0
 
-    // SAFETY: assume_init is OK because we initialised all values above
-    unsafe {
-        return data_times.assume_init();
-    }
+    return data_times;
 }
 
 struct TurnbullData {
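As a side note on the from_vec call above: nalgebra fills a Matrix2xX column-by-column from the flat Vec, which is exactly the order in which read_csv emits the data. A standalone sketch of that property, with values invented for illustration (not from the commit):

use nalgebra::Matrix2xX;

fn main() {
    // Flat data as produced by read_csv: [left_1, right_1, left_2, right_2]
    let records = vec![0.0, 1.0, 2.0, f64::INFINITY];
    let data_times = Matrix2xX::from_vec(records);
    assert_eq!(data_times.ncols(), 2);           // one column per observation
    assert_eq!(data_times[(0, 1)], 2.0);         // left time of the second observation
    assert!(data_times[(1, 1)].is_infinite());   // right time of the second observation
}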