/*
 * Group Split
 *
 * This program splits a delimited file into multiple files, based on
 * the value of the specified column. Rows that share the same value are
 * concatenated into the same file.
 *
 * ---------------------------------------------------------------------
 *
 * This file is part of Group Split.
 *
 * Group Split is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Group Split is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Group Split.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

use std::io::{self, BufRead};
use std::fs::File;
use std::io::Write;

/// Writes every row in `rows` to `out`, joining the fields of a row with
/// `delimiter` and terminating each row with a single `\n`.
///
/// Returns the first I/O error reported by `out`, if any.
fn write_rows<W: Write>(rows: &[Vec<String>], delimiter: &str, out: &mut W) -> io::Result<()> {
    for row in rows {
        out.write_all(row.join(delimiter).as_bytes())?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Appends `group` to `file`, one delimited line per row.
///
/// Writes are buffered so that a group does not cost one system call per
/// row. A write failure is reported on stderr instead of being silently
/// dropped; the function itself still returns nothing so existing callers
/// keep working.
fn write_group(group: &[Vec<String>], delimeter: &str, file: &std::fs::File) {
    let mut out = io::BufWriter::new(file);
    let outcome = write_rows(group, delimeter, &mut out).and_then(|()| out.flush());
    if let Err(err) = outcome {
        eprintln!("group-split: failed to write group: {}", err);
    }
}

/// Returns the number of decimal digits needed to print the largest index
/// in `0..count`, and never less than 1.
///
/// Used to zero-pad output file names so that they all have equal length.
fn index_width(count: usize) -> usize {
    let mut largest = count.saturating_sub(1);
    let mut digits = 1;
    while largest >= 10 {
        largest /= 10;
        digits += 1;
    }
    digits
}

/// Reads delimited rows from `input` and hands each run of consecutive rows
/// that share the same value in column `group_column` to `emit`.
///
/// * Empty lines are skipped.
/// * When `has_header` is true the first non-empty line is taken as the
///   header. It is placed in front of the first group sent to each output,
///   i.e. in front of the first `num_outputs` groups.
/// * Groups are distributed round-robin: the k-th group (counting from
///   zero) is emitted with output index `k % num_outputs`.
/// * `emit(index, rows)` is called once per group, in input order.
///
/// # Errors
///
/// Returns an error if `num_outputs` is zero, if reading from `input`
/// fails, or if a row has no column `group_column`.
fn split_groups<R, F>(
    input: R,
    delimiter: &str,
    has_header: bool,
    group_column: usize,
    num_outputs: usize,
    mut emit: F,
) -> io::Result<()>
where
    R: BufRead,
    F: FnMut(usize, &[Vec<String>]),
{
    if num_outputs == 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "at least one output is required",
        ));
    }

    let mut header: Option<Vec<String>> = None;
    // `None` until the first data row has been seen.
    let mut current_id: Option<String> = None;
    let mut current_group: Vec<Vec<String>> = Vec::new();
    // Number of groups already handed to `emit`; this is also the index of
    // the group currently being collected.
    let mut groups_done: usize = 0;

    for line in input.lines() {
        let line = line?;

        // Skip blank lines
        if line.is_empty() {
            continue;
        }

        let row: Vec<String> = line.split(delimiter).map(str::to_string).collect();

        // Grab the header if necessary
        if has_header && header.is_none() {
            header = Some(row);
            continue;
        }

        let id = match row.get(group_column) {
            Some(id) => id.as_str(),
            None => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!("row has no column {}: {:?}", group_column, line),
                ))
            }
        };

        // Does this line belong to the group we are collecting? If not,
        // flush the previous group (if there is one) and start a new one.
        if current_id.as_deref() != Some(id) {
            if current_id.is_some() {
                emit(groups_done % num_outputs, &current_group[..]);
                groups_done += 1;
                current_group.clear();
            }
            current_id = Some(id.to_string());

            // The first group sent to each output needs the header.
            if groups_done < num_outputs {
                if let Some(ref columns) = header {
                    current_group.push(columns.clone());
                }
            }
        }

        // The row always belongs to the (possibly just started) current
        // group, so it must be kept in both branches above.
        current_group.push(row);
    }

    // Flush whatever is left once the input is exhausted.
    if current_id.is_some() {
        emit(groups_done % num_outputs, &current_group[..]);
    }

    Ok(())
}

fn main() {
    // For this initial version, we are going to define a static delimiter
    // character. This should be specified by the user via command line
    // argument in future versions.
    const DELIMITER: &str = "\t";

    // We are also going to assume the input file has a header.
    const HAS_HEADER: bool = true;

    // We are also going to assume the first column is the group
    // identifier.
    const GROUP_IDENTIFIER: usize = 0;

    // We are also going to assume the number of files to be created. If
    // the input holds fewer groups than this, the surplus files are still
    // created but stay empty.
    const NUM_OUTPUT_FILES: usize = 20;

    // Set up our output files, named by zero-padded index ("00", "01", ...).
    let width = index_width(NUM_OUTPUT_FILES);
    let mut file_descriptors: Vec<File> = Vec::with_capacity(NUM_OUTPUT_FILES);
    for i in 0..NUM_OUTPUT_FILES {
        let name = format!("{:0width$}", i, width = width);
        match File::create(&name) {
            Ok(file) => file_descriptors.push(file),
            Err(err) => {
                eprintln!("group-split: cannot create output file {}: {}", name, err);
                std::process::exit(1);
            }
        }
    }

    // With those program options out of the way, we can start reading the
    // file into our program. The file should be piped into stdin.
    let stdin = io::stdin();
    let outcome = split_groups(
        stdin.lock(),
        DELIMITER,
        HAS_HEADER,
        GROUP_IDENTIFIER,
        file_descriptors.len(),
        |index, rows| write_group(rows, DELIMITER, &file_descriptors[index]),
    );

    if let Err(err) = outcome {
        eprintln!("group-split: {}", err);
        std::process::exit(1);
    }
}