commit 26c5433d16fd3a8dcb9a420bb40c47b45f80bd4c Author: M. George Hansen Date: Wed Feb 6 12:24:39 2019 -0800 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a9d37c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..a91e460 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "csv-sanity" +version = "0.1.0" +authors = ["M. George Hansen "] +license = "MPL-2.0" +maintenance = { status = "passively-maintained" } + +[dependencies] +csv = "0.15.0" +clap = "2.23.3" +log = "0.3.7" +regex = "0.2.1" +lazy_static = "0.2.8" +unicode-segmentation = "1.1.0" +time = "0.1.37" +maplit = "0.1.4" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0" +custom_derive = "0.1.7" +newtype_derive = "0.1.6" +rustc-serialize = "0.3" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a612ad9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. 
"Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. 
For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. 
+Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/README.md b/README.md new file mode 100644 index 0000000..29fdb9e --- /dev/null +++ b/README.md @@ -0,0 +1,268 @@ +# csv-sanity + +Preserve your sanity in a world full of malformed, poorly validated CSV files. +Sanitize and transform large CSVs with millions of records quickly and +efficiently. + +**NOTE:** csv-sanity is in an alpha state and is subject to breaking changes. +The ruleset file syntax in particular is likely to change in the near future. +I've personally used csv-sanity on a number of projects and it has been +incredibly helpful, but as with most alpha software csv-sanity is provided +as-is and provides no warranty or guarantee. Use at your own risk and double +check your transformed files! + +## Purpose + +The CSV format is not well-standardized and has many shortfalls when it comes to +storing large numbers of records with complex data formats, but CSVs are +ubiquitous in many realms as a neutral interchange format that most CRMs and +database software can parse and understand. 
+ +But what happens when your CRM can only parse ISO 8601 formatted dates and the +CSV you inherited has dates in another format such as the following: + +```csv +id,name,signup_date +2,John Doe,11/22/2017 +3,Jane Doe,11/28/2017 +``` + +Or you received a CSV of people who you need to contact via a personalized +email, but your contacts' names in the CSV are in ALL CAPS: + +```csv +id,first_name,last_name +2,JOHN,DOE +3,JANE,DOE +``` + +Or you have a CSV that has valid values for the vast majority of records, but 1 +out of every 20k records has nonsense values that cause your entire import to +abort: + +```csv +id,first_name,last_name,party_registration +2,Jane,Doe,REP +3,John,Doe,DEM +345,Josh,Smith,HAHAHAHA +``` + +Or even a CSV that has a few malformed records due to unescaped commas: + +```csv +id,first_name,last_name,email +2,Jane,Doe,jane@example.com +3,John,Doe,"i,don't,follow,the,rules"@example.com +``` + +These are all real problems I've encountered with CSVs over the years. If the +CSV is small enough they can be corrected by hand, but for CSVs with 10k, 100k +or even millions of records correcting by hand simply isn't a viable option. + +`csv-sanity` aims to solve the issue of sanitizing large, poorly-validated CSVs. + +## Usage + +`csv-sanity` is an executable that takes an input CSV to process and a JSON +ruleset file defining the transformation rules to apply: + +```bash +csv-sanity [-r RULESET_FILE] <INPUT_FILE> +``` + +If a path to a ruleset file is not provided via the `-r` option, `csv-sanity` +will look for a file named "ruleset.json" in the current directory. + +By default, `csv-sanity` outputs two files to the current directory: +output.csv, which contains the processed CSV with validated and transformed +records, and errors.csv, which contains a list of records and fields that +couldn't be processed and reasons they were rejected. 
The paths where the output +and error files are output can be overridden via the `-o FILE_PATH` and +`-e FILE_PATH` options, respectively. + +## ruleset.json Syntax + +Ruleset files are JSON files that define a collection of transformation rules +and the fields to which they should be applied. + +The following is an example ruleset JSON file: + +```json +{ + "rules": [ + { + "applicability": { + "Global": [] + }, + "transformer": { + "None": { + "regex": "\\A(?:[:cntrl:]|\\s)*\\z" + } + }, + "priority": -10 + }, + { + "applicability": { + "Global": [] + }, + "transformer": { + "Trim": {} + }, + "priority": -10 + }, + { + "applicability": { + "Fields": { + "field_names": [ + "first_name", + "last_name" + ] + } + }, + "transformer": { + "Capitalize": {} + } + } + ] +} +``` + +Every ruleset.json file is a JSON object with a single "rules" field with an +array of rule objects. + +Rules are objects with two fields: + +- **"applicability"**: specifies whether a rule applies globally or only to a + predefined set of fields (specified as the column headers in the CSV being + processed) +- **"transformer"**: a transformer object, which specifies how the applicable + fields should be transformed. + +### Transformers + + + +#### Capitalize + +Transforms string fields into Capital Case. + +#### Choice + +Only accepts a pre-defined list of acceptable values and rejects the rest. + +#### Date + +```json +{ + "Date": { + "input_formats": [ + "%m/%d/%Y" + ], + "output_formats": "%F" + } +} +``` + +Attempt to parse fields with a list of datetime formats via +[time::strptime](https://docs.rs/time/0.1.37/time/fn.strptime.html). See the +docs for the [time](https://docs.rs/time/0.1.37/time/index.html) crate for +details on datetime formatting syntax. + +#### Email + +```json +{ + "Email": {} +} +``` + +Attempt to parse fields as email addresses, rejecting any fields that appear to +be invalid email addresses. 
+ +#### None + +```json +{ + "None": { + "regex": "\\A(?:[:cntrl:]|\\s)*\\z" + } +} +``` + +Replace matched fields with a blank value. Useful as a global rule for +normalizing blank fields in a CSV file. + +#### Number + +```json +{ + "Number": {} +} +``` + +Attempt to parse fields as whole integers, rejecting any fields that cannot be +parsed. + +#### PhoneNumber + +```json +{ + "PhoneNumber": {} +} +``` + +Attempt to parse fields as US, NANP-formatted phone numbers, transforming them +into a standard international format of `+1 `. + +#### Regex + +```json +{ + "Regex": { + "regex": "\\A([A-Z])[A-Z]+\\z", + "template": "$1" + } +} +``` + +Match fields against the provided regex pattern and transform them according to +the template string, replacing capture group placeholders. See the +[Regex::replace](https://docs.rs/regex/0.2.1/regex/struct.Regex.html#method.replace) +in the regex crate docs for details. + +#### RegexMatch + +```json +{ + "RegexMatch": { + "regex": "\\A[A-Z]{2,3}\\z", + "negate": false + } +} +``` + +Reject any fields that fail to match against the provided regex pattern. If +`negate` is `true`, then reject any fields that match the provided regex pattern +instead. + +#### Trim + +```json +{ + "Trim": {} +} +``` + +Trim leading and trailing whitespace from fields. Useful as a global rule to +normalize fields and remove useless whitespace. + +#### Zipcode + +```json +{ + "Zipcode": {} +} +``` + +Attempt to parse fields as US zip codes in the formats "xxxxx" and "xxxxx-xxxx", +rejecting any fields that fail to match that format. diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..f90d938 --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,193 @@ +//! Command line interface. + +use std::fs::File; +use std::path::Path; + +use { + Ruleset, + TransformError, + TransformedRecord, +}; + +use csv; + +/// Configuration options for the `Cli`. +pub struct Options +{ + /// See `CsvOptions`. 
+ pub csv_options: CsvOptions, +} + +impl Default for Options { + fn default() -> Options { + Options { + csv_options: Default::default(), + } + } +} + +/// `Cli` configuration options specific to how to parse the CSV file. +/// +/// `CsvOptions` implements `Default` with the following defaults: +/// +/// ``` +/// extern crate csv; +/// use csv_sanity::cli::CsvOptions; +/// use csv::RecordTerminator; +/// +/// let defaults = CsvOptions { +/// delimiter: b',', +/// record_terminator: csv::RecordTerminator::CRLF, +/// quote: b'"', +/// escape: None, +/// double_quote: true, +/// }; +/// // `CsvOptions` does not implement `PartialEq`/`Debug`, so compare field by field. +/// let actual: CsvOptions = Default::default(); +/// assert_eq!(defaults.delimiter, actual.delimiter); +/// assert_eq!(defaults.quote, actual.quote); +/// assert_eq!(defaults.escape, actual.escape); +/// assert_eq!(defaults.double_quote, actual.double_quote); +/// ``` +pub struct CsvOptions +{ + /// Field delimiter to expect in the CSV file. + /// + /// Corresponds to the `csv::Reader.delimiter` method. + pub delimiter: u8, + /// Record terminator to expect in the CSV file. + /// + /// Corresponds to the `csv::Reader.record_terminator` method. See `csv::RecordTerminator`. + pub record_terminator: csv::RecordTerminator, + /// Field quotation character to expect in the CSV file. + /// + /// Corresponds to the `csv::Reader.quote` method. + pub quote: u8, + /// Escape character to expect in the CSV file. + /// + /// Corresponds to the `csv::Reader.escape` method. + pub escape: Option<u8>, + /// Whether two adjacent quote characters should be interpreted as an escaped quote character. + /// + /// Corresponds to the `csv::Reader.double_quote` method. + pub double_quote: bool +} + +impl Default for CsvOptions +{ + fn default() -> CsvOptions { + CsvOptions { + delimiter: b',', + record_terminator: csv::RecordTerminator::CRLF, + quote: b'"', + escape: None, + double_quote: true, + } + } +} + +/// Command line interface for running a `Ruleset` against a CSV file. +pub struct Cli +{ + options: Options, + ruleset: Ruleset, +} + +impl Cli +{ + /// Construct a new `Cli` with default options. 
+ /// + /// ``` + /// use csv_sanity::Ruleset; + /// use csv_sanity::cli::{ + /// Cli + /// }; + /// + /// let ruleset = Ruleset::new(); + /// let cli = Cli::new(ruleset); + /// ``` + pub fn new(ruleset: Ruleset) -> Cli { + Self::new_with_options(ruleset, Default::default()) + } + + /// Construct a new `Cli` with the specified options. + /// + /// ``` + /// use csv_sanity::Ruleset; + /// use csv_sanity::cli::{ + /// Cli, + /// Options, + /// CsvOptions + /// }; + /// + /// let ruleset = Ruleset::new(); + /// let cli = Cli::new_with_options(ruleset, Options { + /// csv_options: CsvOptions { + /// delimiter: b',', + /// .. Default::default() + /// }, + /// .. Default::default() + /// }); + /// ``` + pub fn new_with_options(ruleset: Ruleset, options: Options) -> Cli { + Cli { + options: options, + ruleset: ruleset, + } + } + + /// Apply the ruleset to every record of the CSV file at `input_file_path`. + /// + /// Transformed records are written to `output_file_name`, prefixed with a + /// "Record Number" column, and every rejected record or field is written to + /// `error_file_name`. + /// + /// # Panics + /// + /// Panics if the input file or its headers cannot be read, or if either output + /// file cannot be opened or written to. + pub fn run<I: AsRef<Path>, O: AsRef<Path>, E: AsRef<Path>>(&self, input_file_path: I, output_file_name: O, error_file_name: E) { + let (mut reader, headers) = self.reader_from_file(input_file_path); + + let mut output_writer = csv::Writer::from_file(output_file_name).expect("Unable to open output file for writing"); + let mut output_headers = headers.clone(); + output_headers.insert(0, "Record Number".to_string()); + output_writer.encode(output_headers).expect("Unable to write to output file"); + + let mut error_writer = csv::Writer::from_file(error_file_name).expect("Unable to open error file for writing"); + let error_headers = vec![ + "Record Number", + "Field Name", + "Field Value", + "Reason", + ]; + error_writer.encode(error_headers).expect("Unable to write to error file"); + + for (record_n, record) in reader.records().enumerate() { + let original_line_n = record_n + 2; // Plus one for headers and plus one for zero-indexing. 
+ let transformed_record: TransformedRecord = match record { + Err(e) => { + let err = TransformError { + field_value: "".to_string(), + field_name: "".to_string(), + record_n: original_line_n, + reason: format!("{}", e), + }; + error_writer.encode(err).expect("Unable to write to error file"); + continue; + }, + Ok(ref rec) => self.ruleset.apply_rules(&headers, rec, original_line_n) + }; + let record_fields: Vec> = { + let mut fs = vec![Some(original_line_n.to_string())]; + fs.extend(transformed_record.field_values); + fs + }; + output_writer.encode(record_fields).expect("Unable to write to output file"); + for error in transformed_record.errors { + error_writer.encode(error).expect("Unable to write to error file"); + } + } + } + + fn reader_from_file>(&self, path: P) -> (csv::Reader, Vec) { + let mut reader = csv::Reader::from_file(path.as_ref().clone()).map(|r| { + // Configure the reader according to the options passed to the Cli constructor. + r.has_headers(true) + .delimiter(self.options.csv_options.delimiter) + .record_terminator(self.options.csv_options.record_terminator) + .quote(self.options.csv_options.quote) + .escape(self.options.csv_options.escape) + .double_quote(self.options.csv_options.double_quote) + .flexible(true) + }).expect(&format!("Unable to read file {}", path.as_ref().display())); + let headers = reader.headers() + .expect(&format!("Unable to read headers from input file {}", path.as_ref().display())); + (reader, headers) + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..c6c937b --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,36 @@ +#[macro_use] +extern crate serde_derive; +extern crate serde; +extern crate serde_json; +extern crate regex; +#[macro_use] +extern crate lazy_static; +extern crate unicode_segmentation; +extern crate time; +extern crate csv; +#[macro_use] +extern crate custom_derive; +#[macro_use] +extern crate newtype_derive; +extern crate rustc_serialize; + +mod newtypes; + +pub mod transformer; +pub 
use transformer::{
+    Transformer,
+    TransformResult,
+    TransformResultHelper,
+    TransformError
+};
+
+pub mod transformers;
+
+mod ruleset;
+pub use ruleset::{
+    Rule,
+    Ruleset,
+    TransformedRecord,
+};
+
+pub mod cli;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..9286016
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,116 @@
+extern crate csv_sanity;
+
+extern crate serde_json;
+#[macro_use]
+extern crate log;
+extern crate regex;
+#[macro_use]
+extern crate clap;
+
+use csv_sanity::cli::{
+    self,
+    Cli,
+};
+
+use std::fs::File;
+use std::path::Path;
+use log::{
+    LogRecord,
+    LogLevel,
+    LogMetadata,
+    LogLevelFilter,
+    SetLoggerError
+};
+use clap::{
+    App,
+    Arg
+};
+
+/// Logger that prints every record at or above `log_level` to stdout.
+struct ConsoleLogger {
+    log_level: LogLevel
+}
+
+impl log::Log for ConsoleLogger {
+    fn enabled(&self, metadata: &LogMetadata) -> bool {
+        metadata.level() <= self.log_level
+    }
+
+    fn log(&self, record: &LogRecord) {
+        if self.enabled(record.metadata()) {
+            println!("{} - {}", record.level(), record.args())
+        }
+    }
+}
+
+/// Install a `ConsoleLogger` at `Info` level as the global logger.
+fn init_logging() -> Result<(), SetLoggerError> {
+    log::set_logger(|max_log_level| {
+        max_log_level.set(LogLevelFilter::Info);
+        Box::new(ConsoleLogger { log_level: LogLevel::Info })
+    })
+}
+
+/// Parse the command line, load the ruleset and run the transformation.
+fn main() {
+    init_logging().unwrap();
+
+    let matches = App::new("Convert CSV")
+        .version(crate_version!())
+        .author("M. George Hansen ")
+        .about("Apply a set of transformations to the records in a CSV file, attempting to read a much valid information from the file as possible.")
+        .arg(Arg::with_name("INPUT_FILE")
+            .help("CSV file to process")
+            .required(true)
+            .index(1))
+        .arg(Arg::with_name("output")
+            .help("File to output the transformed CSV records. Defaults to ./output.csv")
+            .short("o")
+            .long("output")
+            .takes_value(true))
+        .arg(Arg::with_name("error_output")
+            .help("File to output errors in CSV format. Defaults to ./errors.csv")
+            .short("e")
+            .long("error_output")
+            .takes_value(true))
+        .arg(Arg::with_name("ruleset")
+            .help("JSON file containing the ruleset to apply. Defaults to ./ruleset.json")
+            .short("r")
+            .long("ruleset")
+            .takes_value(true))
+        .get_matches();
+
+    let ruleset_file_path = Path::new(matches.value_of("ruleset").unwrap_or("ruleset.json"));
+    let ruleset_file = match File::open(ruleset_file_path) {
+        Ok(f) => f,
+        Err(e) => exit_with_error(&format!("unable to read ruleset file {}: {}", ruleset_file_path.display(), e))
+    };
+    let ruleset = match serde_json::from_reader(ruleset_file) {
+        Ok(r) => r,
+        Err(e) => {
+            exit_with_error(&format!("failed to parse ruleset from {}: {}", ruleset_file_path.display(), e));
+        }
+    };
+
+    let cli_app = Cli::new_with_options(ruleset, cli::Options {
+        csv_options: cli::CsvOptions {
+            delimiter: b'\t',
+            .. Default::default()
+        },
+        .. Default::default()
+    });
+
+    // NOTE: Required arguments are validated by clap, so we should be safe to use expect here.
+    let input_file_name = matches.value_of("INPUT_FILE").expect("INPUT_FILE argument could not be found!");
+    // Options must be looked up by the names they were registered under above.
+    let output_file_name = matches.value_of("output").unwrap_or("output.csv");
+    let error_file_name = matches.value_of("error_output").unwrap_or("errors.csv");
+    cli_app.run(input_file_name, output_file_name, error_file_name);
+}
+
+/// Log `error_msg` at error level and terminate the process with exit status 1.
+fn exit_with_error(error_msg: &str) -> !
+{
+    error!("{}", error_msg);
+    std::process::exit(1);
+}
diff --git a/src/newtypes.rs b/src/newtypes.rs
new file mode 100644
index 0000000..87d89e9
--- /dev/null
+++ b/src/newtypes.rs
@@ -0,0 +1,61 @@
+use std::hash::{
+    Hash,
+    Hasher,
+};
+use regex;
+use serde::{
+    Serialize,
+    Serializer,
+    Deserialize,
+    Deserializer,
+};
+
+// Newtype around `regex::Regex` that compares, hashes and (de)serializes by its pattern string,
+// so that types holding a regex can derive `PartialEq`, `Eq`, `Hash`, `Serialize` and `Deserialize`.
+custom_derive! {
+    #[derive(NewtypeFrom, NewtypeDeref, NewtypeDerefMut, Clone, NewtypeDisplay, NewtypeDebug)]
+    pub struct Regex(regex::Regex);
+}
+
+impl PartialEq for Regex {
+    fn eq(&self, other: &Regex) -> bool
+    {
+        self.0.as_str() == other.0.as_str()
+    }
+}
+
+impl Eq for Regex {}
+
+impl Hash for Regex {
+    fn hash<H>(&self, state: &mut H)
+        where H: Hasher {
+        self.as_str().hash(state);
+    }
+}
+
+impl Serialize for Regex
+{
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+        where S: Serializer {
+        let Regex(ref regex) = *self;
+        regex.as_str().serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for Regex
+{
+    fn deserialize<D>(deserializer: D) -> Result<Regex, D::Error>
+        where D: Deserializer<'de>
+    {
+        use serde::de::{Unexpected, Error};
+        let string: Result<String, D::Error> = Deserialize::deserialize(deserializer);
+        string.and_then(|s| {
+            regex::Regex::new(&s)
+                .map(Regex)
+                .map_err(|e| {
+                    let message: &str = &format!("invalid regex string: {}", e);
+                    D::Error::invalid_value(Unexpected::Str(&s), &message)
+                })
+        })
+    }
+}
diff --git a/src/ruleset.rs b/src/ruleset.rs
new file mode 100644
index 0000000..179d3ad
--- /dev/null
+++ b/src/ruleset.rs
@@ -0,0 +1,357 @@
+use Transformer;
+use transformer::{
+    TransformResult,
+    TransformError,
+};
+use transformers::{
+    Transformers,
+    TrimTransformer,
+    NoneTransformer,
+};
+
+use std::hash::{
+    Hash,
+    Hasher,
+};
+use std::iter::FromIterator;
+use std::cmp::Ordering;
+use std::collections::{
+    BinaryHeap,
+    HashSet,
+};
+use std::error;
+use std::fmt::{
+    self,
+    Formatter,
+    Display,
+};
+
+/// Applicability of a `Rule` determining which CSV record's fields it can be applied to.
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+pub enum Applicability {
+    /// Applicable to all CSV record fields.
+    Global,
+    /// Applicable to a subset of a CSV record's fields, specified by field name.
+    Fields {
+        field_names: HashSet<String>
+    }
+}
+
+impl Hash for Applicability {
+    fn hash<H>(&self, state: &mut H)
+        where H: Hasher {
+        use self::Applicability::*;
+        // Equal values must hash equally, so hash a variant tag plus (for `Fields`) the names in
+        // sorted order rather than an address or the set's arbitrary iteration order.
+        match *self {
+            Global => 0u8.hash(state),
+            Fields { ref field_names } => {
+                1u8.hash(state);
+                let mut names: Vec<&String> = field_names.iter().collect();
+                names.sort();
+                names.hash(state);
+            }
+        }
+    }
+}
+
+/// Serde helper: lets `priority` be omitted from a serialized `Rule` when it is the default (0).
+fn priority_is_default(priority: &isize) -> bool {
+    priority == &0
+}
+
+/// A `Transformer` paired with `Applicability` and a priority which can be applied to fields in a
+/// CSV record.
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct Rule
+{
+    applicability: Applicability,
+    transformer: Transformers,
+    #[serde(default, skip_serializing_if="priority_is_default")]
+    priority: isize
+}
+
+impl Rule
+{
+    /// Construct a new `Rule` whose `Transformer` is applicable to one or more CSV record's fields
+    /// referenced by name with the default priority of 0.
+    ///
+    /// # Examples
+    /// ```
+    /// use csv_sanity::Rule;
+    /// use csv_sanity::transformers::*;
+    ///
+    /// let rule = Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
+    ///     CapitalizeTransformer::new()
+    /// ));
+    /// ```
+    pub fn for_fields(field_names: &[&str], transformer: Transformers) -> Rule {
+        Self::for_fields_with_priority(field_names, transformer, Default::default())
+    }
+
+    /// Construct a new `Rule` whose `Transformer` is applicable to one or more CSV record's fields
+    /// referenced by name with the specified priority.
+    ///
+    /// # Examples
+    /// ```
+    /// use csv_sanity::Rule;
+    /// use csv_sanity::transformers::*;
+    ///
+    /// let rule = Rule::for_fields_with_priority(&["First Name", "Last Name"], Transformers::Capitalize(
+    ///     CapitalizeTransformer::new()
+    /// ), 10);
+    /// ```
+    pub fn for_fields_with_priority(field_names: &[&str], transformer: Transformers, priority: isize) -> Rule {
+        Rule {
+            applicability: Applicability::Fields { field_names: field_names.iter().map(|s| s.to_string()).collect() },
+            transformer: transformer,
+            priority: priority
+        }
+    }
+
+    /// Construct a new `Rule` applicable to all of a CSV record's fields with the default priority
+    /// of 0.
+    ///
+    /// # Examples
+    /// ```
+    /// use csv_sanity::Rule;
+    /// use csv_sanity::transformers::*;
+    ///
+    /// let rule = Rule::global(Transformers::Capitalize(
+    ///     CapitalizeTransformer::new()
+    /// ));
+    /// ```
+    pub fn global(transformer: Transformers) -> Rule {
+        Self::global_with_priority(transformer, Default::default())
+    }
+
+    /// Construct a new `Rule` applicable to all of a CSV record's fields with the specified
+    /// priority.
+    ///
+    /// # Examples
+    /// ```
+    /// use csv_sanity::Rule;
+    /// use csv_sanity::transformers::*;
+    ///
+    /// let rule = Rule::global_with_priority(Transformers::Capitalize(
+    ///     CapitalizeTransformer::new()
+    /// ), 10);
+    /// ```
+    pub fn global_with_priority(transformer: Transformers, priority: isize) -> Rule {
+        Rule {
+            applicability: Applicability::Global,
+            transformer: transformer,
+            priority: priority
+        }
+    }
+
+    /// Apply this rule to a CSV record's field, returning the resulting `TransformResult`.
+    ///
+    /// # Examples
+    /// ```
+    /// use csv_sanity::Rule;
+    /// use csv_sanity::transformers::*;
+    ///
+    /// let field = "JOHN";
+    /// let field_name = "First Name";
+    ///
+    /// let rule = Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
+    ///     CapitalizeTransformer::new()
+    /// ));
+    /// rule.apply(field, field_name, 1);
+    /// ```
+    pub fn apply(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        // XXX: Does the applicability check belong inside the apply method? Or should the caller
+        // decide?
+        match self.applicability {
+            Applicability::Global => self.transformer.transform(field_value, field_name, record_n),
+            Applicability::Fields { ref field_names } if field_names.contains(field_name) => {
+                self.transformer.transform(field_value, field_name, record_n)
+            },
+            _ => Ok(Some(field_value.to_string()))
+        }
+    }
+}
+
+impl Ord for Rule
+{
+    fn cmp(&self, other: &Self) -> Ordering {
+        other.priority.cmp(&self.priority)
+    }
+}
+
+impl PartialOrd for Rule
+{
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// An ordered set of `Rule`s sorted by priority.
+///
+/// # Examples
+/// ```
+/// use csv_sanity::{
+///     Ruleset,
+///     Rule,
+///     TransformedRecord,
+/// };
+/// use csv_sanity::transformers::*;
+/// let ruleset = {
+///     let mut r = Ruleset::new();
+///     r.add_rule(Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
+///         CapitalizeTransformer::new()
+///     )));
+///     r.add_rule(Rule::for_fields(&["Email"], Transformers::Email(
+///         EmailTransformer::new()
+///     )));
+///     r
+/// };
+/// let headers = vec!["Id", "First Name", "Last Name", "Email"].iter().map(|s| s.to_string()).collect();
+/// let record = vec!["1", " JOHN", "SNOW ", "\t JSNOW@EXAMPLE.COM "].iter().map(|s| s.to_string()).collect();
+/// let transformed_record = ruleset.apply_rules(&headers, &record, 1);
+/// assert_eq!(TransformedRecord {
+///     field_values: vec!["1", "John", "Snow", "jsnow@example.com"].iter().map(|s| Some(s.to_string())).collect(),
+///     errors: Vec::new(),
+/// }, transformed_record);
+/// ```
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct Ruleset {
+    rules: BinaryHeap<Rule>
+}
+
+impl Ruleset {
+    /// Construct a new `Ruleset` with a default `NoneTransformer` and `TrimTransformer` global
+    /// rules.
+    ///
+    /// The default trim and none rules should be appropriate for most CSV files. For CSV files
+    /// where these default rules are not desired use the `Ruleset::without_default_rules` method.
+    pub fn new() -> Ruleset {
+        let mut ruleset = Self::without_default_rules();
+        // Add a default trim rule and blank rule to match empty fields.
+        ruleset.add_rule(Rule::global_with_priority(Transformers::None(NoneTransformer::with_blank_matcher()), -10));
+        ruleset.add_rule(Rule::global_with_priority(Transformers::Trim(TrimTransformer::new()), -10));
+        ruleset
+    }
+
+    /// Construct a new `Ruleset` without any of the default rules.
+    pub fn without_default_rules() -> Ruleset {
+        Ruleset {
+            rules: BinaryHeap::new()
+        }
+    }
+
+    /// Add a `Rule` to this ruleset.
+    pub fn add_rule(&mut self, rule: Rule) {
+        self.rules.push(rule);
+    }
+
+    /// Validate this ruleset against a CSV file by comparing its `Rule`s against the headers.
+    pub fn validate_rules(&self, headers: &Vec<String>) -> Result<(), Vec<ValidationError>> {
+        let mut errors = Vec::new();
+        let header_set = HashSet::<String>::from_iter(headers.iter().cloned());
+        for rule in self.rules.iter() {
+            if let Applicability::Fields { ref field_names } = rule.applicability {
+                let diff: HashSet<String> = field_names.difference(&header_set).cloned().collect();
+                if !diff.is_empty() {
+                    // FIXME: We should have a better way to construct a ruleset that uses Result
+                    // instead of panic! here.
+                    errors.push(
+                        ValidationError {
+                            reason: format!("The following fields were not found in headers: '{:?}'", diff),
+                        }
+                    )
+                }
+            }
+        }
+        if errors.is_empty() {
+            Ok(())
+        } else {
+            Err(errors)
+        }
+    }
+
+    /// Apply this `Ruleset` to a record from a CSV file.
+    pub fn apply_rules(&self, headers: &Vec<String>, fields: &Vec<String>, record_n: usize) -> TransformedRecord {
+        let expected_n_fields = headers.len();
+        // `BinaryHeap::iter` visits rules in arbitrary order, so sort them explicitly: lowest
+        // priority value first (the order the heap would pop them in), e.g. the default -10
+        // trim/none rules run before priority 0 rules.
+        let mut ordered_rules: Vec<&Rule> = self.rules.iter().collect();
+        ordered_rules.sort_by_key(|rule| rule.priority);
+
+        let mut errors: Vec<TransformError> = Vec::new();
+        let mut transformed_fields: Vec<Option<String>> = Vec::new();
+        for (field_n, field_value) in fields.iter().enumerate() {
+            if field_n < expected_n_fields {
+                let field_name = &headers[field_n];
+                let mut transformed_field_value = Some(field_value.clone());
+                // Try each rule in order of priority and test to see if it is applicable.
+                for rule in ordered_rules.iter() {
+                    let new_value = match transformed_field_value {
+                        Some(ref fv) => {
+                            let transform_result = rule.apply(fv, &field_name, record_n);
+                            match transform_result {
+                                Ok(tfv) => tfv,
+                                Err(e) => {
+                                    errors.push(e);
+                                    None
+                                }
+                            }
+                        },
+                        // The last transformer returned None, so we can short circuit and just
+                        // return None for the field value.
+                        None => break
+                    };
+                    transformed_field_value = new_value;
+                }
+                transformed_fields.push(transformed_field_value);
+            } else {
+                errors.push(
+                    TransformError {
+                        field_value: field_value.to_string(),
+                        field_name: field_n.to_string(),
+                        record_n: record_n,
+                        reason: format!("found {} header fields but record had extra field at position {}", expected_n_fields, field_n)
+                    }
+                );
+            }
+        }
+
+        TransformedRecord {
+            field_values: transformed_fields,
+            errors: errors,
+        }
+    }
+}
+
+/// Error for when a `Ruleset` does not validate against a CSV file.
+#[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Hash, Debug)]
+pub struct ValidationError {
+    reason: String,
+}
+
+impl Display for ValidationError
+{
+    fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
+        write!(formatter, "{}", self.reason)
+    }
+}
+
+impl error::Error for ValidationError
+{
+    fn description(&self) -> &str {
+        &self.reason
+    }
+}
+
+/// A single processed and transformed record.
+#[derive(Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
+pub struct TransformedRecord {
+    /// Transformed fields for the record.
+    ///
+    /// Empty fields are explicitly encoded as `None` values.
+    pub field_values: Vec<Option<String>>,
+    /// Errors that were encountered during transformation, if any.
+    pub errors: Vec<TransformError>,
+}
diff --git a/src/transformer.rs b/src/transformer.rs
new file mode 100644
index 0000000..98ee771
--- /dev/null
+++ b/src/transformer.rs
@@ -0,0 +1,91 @@
+//! Traits and types that define transformations on CSV record fields.
+
+use std::result;
+use std::error;
+use std::fmt::{
+    self,
+    Formatter,
+    Display,
+};
+
+/// `Result` for the transformation of a CSV record's field, either an `Option<String>` if
+/// successfully transformed or a `TransformError` if unsuccessful.
+pub type TransformResult = result::Result<Option<String>, TransformError>;
+
+/// Helper trait with a few useful utility methods for constructing `TransformResult`.
+pub trait TransformResultHelper
+{
+    /// Construct a `TransformResult` that represents a successful transformation of a CSV record's
+    /// field with a non-empty value.
+    fn present(value: &str) -> TransformResult {
+        Ok(Some(value.to_string()))
+    }
+
+    /// Construct a `TransformResult` that represents a successful transformation of a CSV record's
+    /// field with an empty value.
+    fn excluded() -> TransformResult {
+        Ok(None)
+    }
+
+    /// Construct a `TransformResult` that represents a failed transformation of a CSV record's
+    /// field with a descriptive error reason.
+    ///
+    /// An error reason should be a short, single sentence without punctuation or capitalization,
+    /// e.g. "not a valid email address" instead of "The email address was invalid.".
+    ///
+    /// ```
+    /// use csv_sanity::transformer::{
+    ///     TransformResult,
+    ///     TransformError,
+    ///     TransformResultHelper,
+    /// };
+    ///
+    /// let result = TransformResult::error("jak,.@hot mail.com", "Email", 0, "not a valid email address");
+    /// assert_eq!(result, Err(TransformError {
+    ///     field_value: "jak,.@hot mail.com".to_string(),
+    ///     field_name: "Email".to_string(),
+    ///     record_n: 0,
+    ///     reason: "not a valid email address".to_string(),
+    /// }));
+    /// ```
+    fn error(field_value: &str, field_name: &str, record_n: usize, reason: &str) -> TransformResult {
+        Err(
+            TransformError {
+                field_value: field_value.to_string(),
+                field_name: field_name.to_string(),
+                record_n: record_n,
+                reason: reason.to_string(),
+            }
+        )
+    }
+}
+
+impl TransformResultHelper for TransformResult {}
+
+pub trait Transformer
+{
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult;
+}
+
+#[derive(RustcEncodable, Deserialize, Serialize, Clone, PartialEq, Eq, Hash, Debug)]
+pub struct TransformError
+{
+    pub record_n: usize,
+    pub field_name: String,
+    pub field_value: String,
+    pub reason: String,
+}
+
+impl Display for TransformError
+{
+    fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
+        write!(formatter, "failed to transform field: {}", self.reason)
+    }
+}
+
+impl error::Error for TransformError
+{
+    fn description(&self) -> &str {
+        &self.reason
+    }
+}
diff --git a/src/transformers/capitalize.rs b/src/transformers/capitalize.rs
new file mode 100644
index 0000000..8b4d627
--- /dev/null
+++ b/src/transformers/capitalize.rs
@@ -0,0 +1,43 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+
+use unicode_segmentation::UnicodeSegmentation;
+
+/// Capitalize each word of `string` (as split by `unicode_words`), joined by single spaces.
+pub fn capitalize(string: &str) -> String
+{
+    string.unicode_words()
+        .map(capitalize_word).collect::<Vec<String>>()
+        .join(" ")
+}
+
+/// Upper-case the first character of `word` and lower-case every other character.
+fn capitalize_word(word: &str) -> String
+{
+    word.chars().enumerate()
+        .map(|(i, c)| if i == 0 { c.to_uppercase().collect::<String>() } else { c.to_lowercase().collect() })
+        .collect()
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct CapitalizeTransformer {}
+
+impl CapitalizeTransformer
+{
+    pub fn new() -> CapitalizeTransformer
+    {
+        CapitalizeTransformer {}
+    }
+}
+
+impl Transformer for CapitalizeTransformer
+{
+    fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult
+    {
+        let result = capitalize(field_value);
+        TransformResult::present(&result)
+    }
+}
diff --git a/src/transformers/choice.rs b/src/transformers/choice.rs
new file mode 100644
index 0000000..f3dc7c9
--- /dev/null
+++ b/src/transformers/choice.rs
@@ -0,0 +1,37 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct ChoiceTransformer {
+    choices: Vec<String>,
+}
+
+impl ChoiceTransformer
+{
+    pub fn new(choices: Vec<String>) -> ChoiceTransformer
+    {
+        ChoiceTransformer {
+            choices: choices,
+        }
+    }
+}
+
+impl Transformer for ChoiceTransformer
+{
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult
+    {
+        if self.choices.iter().any(|choice| choice == field_value) {
+            TransformResult::present(field_value)
+        } else {
+            TransformResult::error(
+                field_value,
+                field_name,
+                record_n,
+                &format!("not in valid choices {:?}", self.choices)
+            )
+        }
+    }
+}
diff --git a/src/transformers/date.rs b/src/transformers/date.rs
new file mode 100644
index 0000000..b8b294d
--- /dev/null
+++ b/src/transformers/date.rs
@@ -0,0 +1,44 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+
+use time::{
+    strptime
+};
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct DateTransformer {
+    input_formats: Vec<String>,
+    output_format: String
+}
+
+impl DateTransformer {
+    pub fn new(input_formats: Vec<String>, output_format: &str) -> DateTransformer {
+        DateTransformer {
+            input_formats: input_formats,
+            output_format: output_format.to_string()
+        }
+    }
+
+    pub fn with_iso8601_output(input_formats: Vec<String>) -> DateTransformer {
+        Self::new(input_formats, "%F")
+    }
+}
+
+impl Transformer for DateTransformer {
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        for format in self.input_formats.iter() {
+            if let Ok(time) = strptime(field_value, format) {
+                // The output format comes from the ruleset, so report a bad one as a field error
+                // instead of panicking.
+                return match time.strftime(&self.output_format) {
+                    Ok(formatted) => TransformResult::present(&formatted.to_string()),
+                    Err(e) => TransformResult::error(field_value, field_name, record_n, &format!("invalid output date format: {}", e)),
+                };
+            }
+        }
+        TransformResult::error(field_value, field_name, record_n, "unable to parse as date")
+    }
+}
diff --git a/src/transformers/email.rs b/src/transformers/email.rs
new file mode 100644
index 0000000..5ee5760
--- /dev/null
+++ b/src/transformers/email.rs
@@ -0,0 +1,30 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+
+use regex::Regex;
+
+lazy_static! {
+    static ref EMAIL_REGEX: Regex = Regex::new(r"(?i)\A[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\z").unwrap();
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct EmailTransformer {}
+
+impl EmailTransformer {
+    pub fn new() -> EmailTransformer {
+        EmailTransformer {}
+    }
+}
+
+impl Transformer for EmailTransformer {
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        if EMAIL_REGEX.is_match(field_value) {
+            TransformResult::present(&field_value.to_lowercase())
+        } else {
+            TransformResult::error(field_value, field_name, record_n, "invalid email address")
+        }
+    }
+}
diff --git a/src/transformers/mod.rs b/src/transformers/mod.rs
new file mode 100644
index 0000000..9c74023
--- /dev/null
+++ b/src/transformers/mod.rs
@@ -0,0 +1,76 @@
+use transformer::{
+    Transformer,
+    TransformResult,
+};
+
+mod trim;
+pub use self::trim::TrimTransformer;
+
+mod none;
+pub use self::none::NoneTransformer;
+
+mod regex;
+pub use self::regex::{
+    RegexTransformer,
+    RegexMatchTransformer
+};
+
+mod capitalize;
+pub use self::capitalize::{
+    CapitalizeTransformer,
+    capitalize
+};
+
+mod email;
+pub use self::email::EmailTransformer;
+
+mod number;
+pub use self::number::NumberTransformer;
+
+mod date;
+pub use self::date::DateTransformer;
+
+mod choice;
+pub use self::choice::ChoiceTransformer;
+
+mod zipcode;
+pub use self::zipcode::ZipcodeTransformer;
+
+mod phone_number;
+pub use self::phone_number::PhoneNumberTransformer;
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub enum Transformers
+{
+    Trim(TrimTransformer),
+    None(NoneTransformer),
+    Regex(RegexTransformer),
+    RegexMatch(RegexMatchTransformer),
+    Capitalize(CapitalizeTransformer),
+    Email(EmailTransformer),
+    Number(NumberTransformer),
+    Date(DateTransformer),
+    Choice(ChoiceTransformer),
+    Zipcode(ZipcodeTransformer),
+    PhoneNumber(PhoneNumberTransformer),
+}
+
+impl Transformer for Transformers {
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        use self::Transformers::*;
+
+        match *self {
+            Trim(ref t) => t.transform(field_value, field_name, record_n),
+            None(ref t) => t.transform(field_value, field_name, record_n),
+            Regex(ref t) => t.transform(field_value, field_name, record_n),
+            RegexMatch(ref t) => t.transform(field_value, field_name, record_n),
+            Capitalize(ref t) => t.transform(field_value, field_name, record_n),
+            Email(ref t) => t.transform(field_value, field_name, record_n),
+            Number(ref t) => t.transform(field_value, field_name, record_n),
+            Date(ref t) => t.transform(field_value, field_name, record_n),
+            Choice(ref t) => t.transform(field_value, field_name, record_n),
+            Zipcode(ref t) => t.transform(field_value, field_name, record_n),
+            PhoneNumber(ref t) => t.transform(field_value, field_name, record_n)
+        }
+    }
+}
diff --git a/src/transformers/none.rs b/src/transformers/none.rs
new file mode 100644
index 0000000..67a2250
--- /dev/null
+++ b/src/transformers/none.rs
@@ -0,0 +1,35 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+use newtypes::Regex;
+
+use regex;
+
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct NoneTransformer {
+    regex: Regex
+}
+
+impl NoneTransformer {
+    pub fn new(regex: regex::Regex) -> NoneTransformer {
+        NoneTransformer { regex: Regex::from(regex) }
+    }
+
+    pub fn with_blank_matcher() -> NoneTransformer {
+        // A POSIX class is only recognised inside a bracket expression: `[[:cntrl:]]`.
+        Self::new(regex::Regex::new(r"\A(?:[[:cntrl:]]|\s)*\z").unwrap())
+    }
+}
+
+impl Transformer for NoneTransformer {
+    fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult {
+        if self.regex.is_match(field_value) {
+            TransformResult::excluded()
+        } else {
+            TransformResult::present(field_value)
+        }
+    }
+}
diff --git a/src/transformers/number.rs b/src/transformers/number.rs
new file mode 100644
index
0000000..070714f
--- /dev/null
+++ b/src/transformers/number.rs
@@ -0,0 +1,31 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+
+use regex::Regex;
+
+lazy_static! {
+    // Non-capturing group is spelled `(?:…)`; `(:?…)` would also accept a leading colon.
+    static ref INTEGER_REGEX: Regex = Regex::new(r"\A(?:0|[1-9]\d*)\z").unwrap();
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct NumberTransformer { }
+
+impl NumberTransformer {
+    pub fn match_integer() -> NumberTransformer {
+        NumberTransformer { }
+    }
+}
+
+impl Transformer for NumberTransformer {
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        if INTEGER_REGEX.is_match(field_value) {
+            TransformResult::present(field_value)
+        } else {
+            TransformResult::error(field_value, field_name, record_n, "not a valid number")
+        }
+    }
+}
diff --git a/src/transformers/phone_number.rs b/src/transformers/phone_number.rs
new file mode 100644
index 0000000..cd81f25
--- /dev/null
+++ b/src/transformers/phone_number.rs
@@ -0,0 +1,34 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+
+use regex::Regex;
+
+lazy_static! {
+    static ref NANP_REGEX: Regex = Regex::new(r"\A(?:\+?1)?\D*\(?(?P<area>\d{3})\)?\D*(?P<exchange>\d{3})\D*(?P<subscriber>\d{4})\z").unwrap();
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct PhoneNumberTransformer { }
+
+impl PhoneNumberTransformer {
+    pub fn expect_nanp_format() -> PhoneNumberTransformer {
+        PhoneNumberTransformer { }
+    }
+}
+
+impl Transformer for PhoneNumberTransformer {
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        if let Some(captures) = NANP_REGEX.captures(field_value) {
+            let area_code = captures.name("area").unwrap().as_str();
+            let exchange_code = captures.name("exchange").unwrap().as_str();
+            let subscriber_number = captures.name("subscriber").unwrap().as_str();
+            let phone_number = format!("+1 {} {} {}", area_code, exchange_code, subscriber_number);
+            TransformResult::present(&phone_number)
+        } else {
+            TransformResult::error(field_value, field_name, record_n, "not a valid NANP format phone number")
+        }
+    }
+}
diff --git a/src/transformers/regex.rs b/src/transformers/regex.rs
new file mode 100644
index 0000000..06e5268
--- /dev/null
+++ b/src/transformers/regex.rs
@@ -0,0 +1,88 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+use newtypes::Regex;
+
+use regex;
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct RegexTransformer
+{
+    regex: Regex,
+    template: String
+}
+
+impl RegexTransformer
+{
+    pub fn new(regex: regex::Regex, template: &str) -> RegexTransformer {
+        RegexTransformer {
+            regex: Regex::from(regex),
+            template: template.to_string()
+        }
+    }
+}
+
+impl Transformer for RegexTransformer
+{
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        if let Some(captures) = self.regex.captures(field_value) {
+            let mut expansion = String::new();
+            captures.expand(&self.template, &mut expansion);
+            TransformResult::present(&expansion)
+        } else {
+            TransformResult::error(
+                field_value,
+                field_name,
+                record_n,
+                &format!("did not match pattern {}", self.regex)
+            )
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct RegexMatchTransformer
+{
+    regex: Regex,
+    negate: bool
+}
+
+impl RegexMatchTransformer
+{
+    pub fn matching(regex: regex::Regex) -> RegexMatchTransformer {
+        RegexMatchTransformer {
+            regex: Regex::from(regex),
+            negate: false
+        }
+    }
+
+    pub fn not_matching(regex: regex::Regex) -> RegexMatchTransformer {
+        RegexMatchTransformer {
+            regex: Regex::from(regex),
+            negate: true
+        }
+    }
+}
+
+impl Transformer for RegexMatchTransformer
+{
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        let mut is_match = self.regex.is_match(field_value);
+        if self.negate {
+            is_match = !is_match;
+        }
+
+        if is_match {
+            TransformResult::present(field_value)
+        } else {
+            let reason = if self.negate {
+                format!("matched exclusionary pattern {}", self.regex)
+            } else {
+                format!("did not match pattern {}", self.regex)
+            };
+            TransformResult::error(field_value, field_name, record_n, &reason)
+        }
+    }
+}
diff --git a/src/transformers/trim.rs b/src/transformers/trim.rs
new file mode 100644
index 0000000..63b4a45
--- /dev/null
+++ b/src/transformers/trim.rs
@@ -0,0 +1,20 @@
+use Transformer;
+use transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct TrimTransformer {}
+
+impl TrimTransformer {
+    pub fn new() -> TrimTransformer {
+        TrimTransformer {}
+    }
+}
+
+impl Transformer for TrimTransformer {
+    fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult {
+        TransformResult::present(field_value.trim())
+    }
+}
diff --git a/src/transformers/zipcode.rs b/src/transformers/zipcode.rs
new file mode 100644
index 0000000..439a887
--- /dev/null
+++ b/src/transformers/zipcode.rs
@@ -0,0 +1,37 @@
+use Transformer;
+use
transformer::{
+    TransformResultHelper,
+    TransformResult
+};
+
+use regex::Regex;
+
+lazy_static! {
+    static ref ZIP_REGEX: Regex = Regex::new(r"\A(\d{5})\D*(?:(\d{4}))?\z").unwrap();
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
+pub struct ZipcodeTransformer { }
+
+impl ZipcodeTransformer {
+    pub fn new() -> ZipcodeTransformer {
+        ZipcodeTransformer { }
+    }
+}
+
+impl Transformer for ZipcodeTransformer {
+    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
+        // Reject anything the pattern does not match before building the normalized code.
+        let captures = match ZIP_REGEX.captures(field_value) {
+            Some(captures) => captures,
+            None => return TransformResult::error(field_value, field_name, record_n, "not a valid zipcode"),
+        };
+        // Group 1 always participates in a match; group 2 (the plus-four part) is optional.
+        let base_code = captures.get(1).unwrap().as_str();
+        let zipcode = match captures.get(2) {
+            Some(plus_four) => format!("{}-{}", base_code, plus_four.as_str()),
+            None => base_code.to_string(),
+        };
+        TransformResult::present(&zipcode)
+    }
+}