Parse CSV robustly
This commit is contained in:
parent
f6f44c64ab
commit
461eb8db5f
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -313,6 +313,7 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"console",
|
"console",
|
||||||
|
"csv",
|
||||||
"indicatif",
|
"indicatif",
|
||||||
"nalgebra",
|
"nalgebra",
|
||||||
"prettytable-rs",
|
"prettytable-rs",
|
||||||
|
@ -6,6 +6,7 @@ edition = "2021"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { version = "4.2.1", features = ["derive"] }
|
clap = { version = "4.2.1", features = ["derive"] }
|
||||||
console = "0.15.5"
|
console = "0.15.5"
|
||||||
|
csv = "1.2.1"
|
||||||
indicatif = {version = "0.17.3", features = ["rayon"]}
|
indicatif = {version = "0.17.3", features = ["rayon"]}
|
||||||
nalgebra = "0.32.2"
|
nalgebra = "0.32.2"
|
||||||
prettytable-rs = "0.10.0"
|
prettytable-rs = "0.10.0"
|
||||||
|
@ -16,10 +16,10 @@
|
|||||||
|
|
||||||
const Z_97_5: f64 = 1.959964; // This is the limit of resolution for an f64
|
const Z_97_5: f64 = 1.959964; // This is the limit of resolution for an f64
|
||||||
|
|
||||||
use std::fs;
|
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
use clap::{Args, ValueEnum};
|
use clap::{Args, ValueEnum};
|
||||||
|
use csv::{Reader, StringRecord};
|
||||||
use indicatif::{ParallelProgressIterator, ProgressBar, ProgressDrawTarget, ProgressStyle};
|
use indicatif::{ParallelProgressIterator, ProgressBar, ProgressDrawTarget, ProgressStyle};
|
||||||
use nalgebra::{DMatrix, DVector, Matrix1xX};
|
use nalgebra::{DMatrix, DVector, Matrix1xX};
|
||||||
use prettytable::{Table, format, row};
|
use prettytable::{Table, format, row};
|
||||||
@ -58,46 +58,8 @@ enum OutputFormat {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn main(args: IntCoxArgs) {
|
pub fn main(args: IntCoxArgs) {
|
||||||
let lines: Vec<String>;
|
|
||||||
if args.input == "-" {
|
|
||||||
lines = io::stdin().lines().map(|l| l.unwrap()).collect();
|
|
||||||
} else {
|
|
||||||
let contents = fs::read_to_string(args.input).unwrap();
|
|
||||||
lines = contents.trim_end().split("\n").map(|s| s.to_string()).collect();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read data into matrices
|
|
||||||
|
|
||||||
let mut data_times: DMatrix<f64> = DMatrix::zeros(
|
|
||||||
2, // Left time, right time
|
|
||||||
lines.len() - 1 // Minus 1 row for header row
|
|
||||||
);
|
|
||||||
|
|
||||||
// Called "Z" in the paper and "X" in the C++ code
|
|
||||||
let mut data_indep: DMatrix<f64> = DMatrix::zeros(
|
|
||||||
lines[0].split(",").count() - 2,
|
|
||||||
lines.len() - 1 // Minus 1 row for header row
|
|
||||||
);
|
|
||||||
|
|
||||||
// Read header row
|
|
||||||
let indep_names: Vec<&str> = lines[0].split(",").skip(2).collect();
|
|
||||||
|
|
||||||
// Read data
|
// Read data
|
||||||
// FIXME: Parse CSV more robustly
|
let (indep_names, data_times, data_indep) = read_data(&args.input);
|
||||||
for (i, row) in lines.iter().skip(1).enumerate() {
|
|
||||||
for (j, item) in row.split(",").enumerate() {
|
|
||||||
let value = match item {
|
|
||||||
"inf" => f64::INFINITY,
|
|
||||||
_ => item.parse().expect("Malformed float")
|
|
||||||
};
|
|
||||||
|
|
||||||
if j < 2 {
|
|
||||||
data_times[(j, i)] = value;
|
|
||||||
} else {
|
|
||||||
data_indep[(j - 2, i)] = value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fit regression
|
// Fit regression
|
||||||
let progress_bar = ProgressBar::with_draw_target(Some(0), ProgressDrawTarget::term_like(Box::new(UnconditionalTermLike::stderr())));
|
let progress_bar = ProgressBar::with_draw_target(Some(0), ProgressDrawTarget::term_like(Box::new(UnconditionalTermLike::stderr())));
|
||||||
@ -137,6 +99,55 @@ pub fn main(args: IntCoxArgs) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn read_data(path: &str) -> (Vec<String>, DMatrix<f64>, DMatrix<f64>) {
|
||||||
|
// Read CSV into memory
|
||||||
|
let headers: StringRecord;
|
||||||
|
let records: Vec<StringRecord>;
|
||||||
|
if path == "-" {
|
||||||
|
let mut csv_reader = Reader::from_reader(io::stdin());
|
||||||
|
headers = csv_reader.headers().unwrap().clone();
|
||||||
|
records = csv_reader.records().map(|r| r.unwrap()).collect();
|
||||||
|
} else {
|
||||||
|
let mut csv_reader = Reader::from_path(path).unwrap();
|
||||||
|
headers = csv_reader.headers().unwrap().clone();
|
||||||
|
records = csv_reader.records().map(|r| r.unwrap()).collect();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read data into matrices
|
||||||
|
|
||||||
|
let mut data_times: DMatrix<f64> = DMatrix::zeros(
|
||||||
|
2, // Left time, right time
|
||||||
|
records.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Called "Z" in the paper and "X" in the C++ code
|
||||||
|
let mut data_indep: DMatrix<f64> = DMatrix::zeros(
|
||||||
|
headers.len() - 2,
|
||||||
|
records.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Parse header row
|
||||||
|
let indep_names: Vec<String> = headers.iter().skip(2).map(String::from).collect();
|
||||||
|
|
||||||
|
// Parse data
|
||||||
|
for (i, row) in records.iter().enumerate() {
|
||||||
|
for (j, item) in row.iter().enumerate() {
|
||||||
|
let value = match item {
|
||||||
|
"inf" => f64::INFINITY,
|
||||||
|
_ => item.parse().expect("Malformed float")
|
||||||
|
};
|
||||||
|
|
||||||
|
if j < 2 {
|
||||||
|
data_times[(j, i)] = value;
|
||||||
|
} else {
|
||||||
|
data_indep[(j - 2, i)] = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (indep_names, data_times, data_indep);
|
||||||
|
}
|
||||||
|
|
||||||
struct IntervalCensoredCoxData {
|
struct IntervalCensoredCoxData {
|
||||||
data_times: DMatrix<f64>,
|
data_times: DMatrix<f64>,
|
||||||
data_indep: DMatrix<f64>,
|
data_indep: DMatrix<f64>,
|
||||||
|
@ -14,54 +14,20 @@
|
|||||||
// You should have received a copy of the GNU Affero General Public License
|
// You should have received a copy of the GNU Affero General Public License
|
||||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
use std::fs;
|
|
||||||
|
|
||||||
use indicatif::ProgressBar;
|
use indicatif::ProgressBar;
|
||||||
use nalgebra::DMatrix;
|
|
||||||
|
|
||||||
use hpstat::intcox::fit_interval_censored_cox;
|
use hpstat::intcox;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_intcox_zeng_mao_lin() {
|
fn test_intcox_zeng_mao_lin() {
|
||||||
// Compare "Bangkok Metropolitan Administration HIV" data from Zeng, Mao & Lin (2016) with Stata 17 output
|
// Compare "Bangkok Metropolitan Administration HIV" data from Zeng, Mao & Lin (2016) with Stata 17 output
|
||||||
|
|
||||||
let contents = fs::read_to_string("tests/zeng_mao_lin.csv").unwrap();
|
let (_indep_names, data_times, data_indep) = intcox::read_data("tests/zeng_mao_lin.csv");
|
||||||
let lines: Vec<String> = contents.trim_end().split("\n").map(|s| s.to_string()).collect();
|
|
||||||
|
|
||||||
// Read data into matrices
|
|
||||||
|
|
||||||
let mut data_times: DMatrix<f64> = DMatrix::zeros(
|
|
||||||
2, // Left time, right time
|
|
||||||
lines.len() - 1 // Minus 1 row for header row
|
|
||||||
);
|
|
||||||
|
|
||||||
// Called "Z" in the paper and "X" in the C++ code
|
|
||||||
let mut data_indep: DMatrix<f64> = DMatrix::zeros(
|
|
||||||
lines[0].split(",").count() - 2,
|
|
||||||
lines.len() - 1 // Minus 1 row for header row
|
|
||||||
);
|
|
||||||
|
|
||||||
// Read data
|
|
||||||
// FIXME: Parse CSV more robustly
|
|
||||||
for (i, row) in lines.iter().skip(1).enumerate() {
|
|
||||||
for (j, item) in row.split(",").enumerate() {
|
|
||||||
let value = match item {
|
|
||||||
"inf" => f64::INFINITY,
|
|
||||||
_ => item.parse().expect("Malformed float")
|
|
||||||
};
|
|
||||||
|
|
||||||
if j < 2 {
|
|
||||||
data_times[(j, i)] = value;
|
|
||||||
} else {
|
|
||||||
data_indep[(j - 2, i)] = value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fit regression
|
// Fit regression
|
||||||
let progress_bar = ProgressBar::hidden();
|
let progress_bar = ProgressBar::hidden();
|
||||||
//let result = fit_interval_censored_cox(data_times, data_indep, 200, 0.00005, false, progress_bar);
|
//let result = fit_interval_censored_cox(data_times, data_indep, 200, 0.00005, false, progress_bar);
|
||||||
let result = fit_interval_censored_cox(data_times, data_indep, 100, 0.0001, false, progress_bar);
|
let result = intcox::fit_interval_censored_cox(data_times, data_indep, 100, 0.0001, false, progress_bar);
|
||||||
|
|
||||||
// import delimited "zeng_mao_lin.csv", case(preserve) numericcols(2)
|
// import delimited "zeng_mao_lin.csv", case(preserve) numericcols(2)
|
||||||
// stintcox Needle Needle2 LogAge GenderM RaceO RaceW GenderM_RaceO GenderM_RaceW, interval(Left_Time Right_Time) full nohr favorspeed lrmodel
|
// stintcox Needle Needle2 LogAge GenderM RaceO RaceW GenderM_RaceO GenderM_RaceW, interval(Left_Time Right_Time) full nohr favorspeed lrmodel
|
||||||
|
Loading…
Reference in New Issue
Block a user