Initial commit

This commit is contained in:
M. George Hansen 2019-02-06 12:24:39 -08:00
commit 26c5433d16
Signed by: mgeorgehansen
SSH key fingerprint: SHA256:JlIGiQLPyQ2RHTH3a2oVlb20Xkh9Glr8DUF4YTXHJxM
21 changed files with 1968 additions and 0 deletions

193
src/cli.rs Normal file
View file

@ -0,0 +1,193 @@
//! Command line interface.
use std::fs::File;
use std::path::Path;
use {
Ruleset,
TransformError,
TransformedRecord,
};
use csv;
/// Configuration options for the `Cli`.
///
/// Implements `Default`, so individual settings can be overridden with struct-update syntax
/// (`Options { csv_options: ..., .. Default::default() }`).
pub struct Options
{
/// Options applied to the CSV reader that parses the input file. See `CsvOptions`.
pub csv_options: CsvOptions,
}
impl Default for Options {
fn default() -> Options {
Options {
csv_options: Default::default(),
}
}
}
/// `Cli` configuration options specific to how to parse the CSV file.
///
/// `CsvOptions` implements `Default` with the following defaults:
///
/// ```
/// extern crate csv;
/// use csv_sanity::cli::CsvOptions;
/// use csv::RecordTerminator;
///
/// let defaults = CsvOptions {
/// delimiter: b',',
/// record_terminator: csv::RecordTerminator::CRLF,
/// quote: b'"',
/// escape: None,
/// double_quote: true,
/// };
/// assert_eq!(defaults, Default::default());
/// ```
// NOTE(review): the example above uses `assert_eq!`, which requires `PartialEq` and `Debug` on
// `CsvOptions`; no such derives are visible on this struct — confirm the doctest compiles.
pub struct CsvOptions
{
/// Field delimiter to expect in the CSV file.
///
/// Corresponds to the `csv::Reader.delimiter` method.
pub delimiter: u8,
/// Record terminator to expect in the CSV file.
///
/// Corresponds to the `csv::Reader.record_terminator` method. See `csv::RecordTerminator`.
pub record_terminator: csv::RecordTerminator,
/// Field quotation character to expect in the CSV file.
///
/// Corresponds to the `csv::Reader.quote` method.
pub quote: u8,
/// Escape character to expect in the CSV file, or `None` for no escape character.
///
/// Corresponds to the `csv::Reader.escape` method.
pub escape: Option<u8>,
/// Whether two adjacent quote characters should be interpreted as an escaped quote character.
///
/// Corresponds to the `csv::Reader.double_quote` method.
pub double_quote: bool
}
impl Default for CsvOptions
{
    /// Defaults: `,` delimiter, `RecordTerminator::CRLF`, `"` quote, no escape character and
    /// doubled quotes treated as an escaped quote.
    fn default() -> Self {
        let delimiter = b',';
        let quote = b'"';
        CsvOptions {
            double_quote: true,
            escape: None,
            quote: quote,
            record_terminator: csv::RecordTerminator::CRLF,
            delimiter: delimiter,
        }
    }
}
/// Command line interface for running a `Ruleset` against a CSV file.
pub struct Cli
{
/// Configuration used when opening the input CSV file.
options: Options,
/// Rules applied to every record read from the input file.
ruleset: Ruleset,
}
impl Cli
{
    /// Construct a new `Cli` with default options.
    ///
    /// ```
    /// use csv_sanity::Ruleset;
    /// use csv_sanity::cli::{
    /// Cli
    /// };
    ///
    /// let ruleset = Ruleset::new();
    /// let cli = Cli::new(ruleset);
    /// ```
    pub fn new(ruleset: Ruleset) -> Cli {
        Self::new_with_options(ruleset, Default::default())
    }

    /// Construct a new `Cli` with the specified options.
    ///
    /// ```
    /// use csv_sanity::Ruleset;
    /// use csv_sanity::cli::{
    /// Cli,
    /// Options,
    /// CsvOptions
    /// };
    ///
    /// let ruleset = Ruleset::new();
    /// let cli = Cli::new_with_options(ruleset, Options {
    /// csv_options: CsvOptions {
    /// delimiter: b',',
    /// .. Default::default()
    /// },
    /// .. Default::default()
    /// });
    /// ```
    pub fn new_with_options(ruleset: Ruleset, options: Options) -> Cli {
        Cli {
            options: options,
            ruleset: ruleset,
        }
    }

    /// Run the ruleset over every record of the CSV file at `input_file_path`.
    ///
    /// Transformed records are written to `output_file_name`, prefixed with a "Record Number"
    /// column. Every read or transformation error is written as a row to `error_file_name`.
    /// Records that cannot be read are reported in the error file and skipped.
    ///
    /// # Panics
    /// Panics if the input file or its headers cannot be read, or if either output file cannot
    /// be opened or written to.
    pub fn run<I: AsRef<Path>, O: AsRef<Path>, E: AsRef<Path>>(&self, input_file_path: I, output_file_name: O, error_file_name: E) {
        let (mut reader, headers) = self.reader_from_file(input_file_path);

        let mut output_writer = csv::Writer::from_file(output_file_name).expect("Unable to open output file for writing");
        // Build the output header row in one pass instead of cloning and then shifting every
        // element with a front insert.
        let mut output_headers: Vec<String> = Vec::with_capacity(headers.len() + 1);
        output_headers.push("Record Number".to_string());
        output_headers.extend(headers.iter().cloned());
        output_writer.encode(output_headers).expect("Unable to write to output file");

        let mut error_writer = csv::Writer::from_file(error_file_name).expect("Unable to open error file for writing");
        let error_headers = vec![
            "Record Number",
            "Field Name",
            "Field Value",
            "Reason",
        ];
        error_writer.encode(error_headers).expect("Unable to write to error file");

        for (record_n, record) in reader.records().enumerate() {
            let original_line_n = record_n + 2; // Plus one for headers and plus one for zero-indexing.
            let rec = match record {
                Ok(rec) => rec,
                Err(e) => {
                    // The record could not be read at all: report it and move on.
                    let err = TransformError {
                        field_value: String::new(),
                        field_name: String::new(),
                        record_n: original_line_n,
                        reason: e.to_string(),
                    };
                    error_writer.encode(err).expect("Unable to write to error file");
                    continue;
                }
            };
            let transformed_record: TransformedRecord = self.ruleset.apply_rules(&headers, &rec, original_line_n);

            let mut record_fields: Vec<Option<String>> = Vec::with_capacity(transformed_record.field_values.len() + 1);
            record_fields.push(Some(original_line_n.to_string()));
            record_fields.extend(transformed_record.field_values);
            output_writer.encode(record_fields).expect("Unable to write to output file");

            for error in transformed_record.errors {
                error_writer.encode(error).expect("Unable to write to error file");
            }
        }
    }

    /// Open `path` as a CSV reader configured from `self.options` and read its header row.
    ///
    /// # Panics
    /// Panics if the file cannot be opened or its headers cannot be read.
    fn reader_from_file<P: AsRef<Path>>(&self, path: P) -> (csv::Reader<File>, Vec<String>) {
        let path = path.as_ref();
        let csv_options = &self.options.csv_options;
        // The panic messages are only formatted on failure; the text matches what
        // `expect(&format!(..))` produced ("<message>: <error debug>").
        let mut reader = csv::Reader::from_file(path)
            .map(|r| {
                // Configure the reader according to the options passed to the Cli constructor.
                r.has_headers(true)
                    .delimiter(csv_options.delimiter)
                    .record_terminator(csv_options.record_terminator)
                    .quote(csv_options.quote)
                    .escape(csv_options.escape)
                    .double_quote(csv_options.double_quote)
                    .flexible(true)
            })
            .unwrap_or_else(|e| panic!("Unable to read file {}: {:?}", path.display(), e));
        let headers = reader.headers()
            .unwrap_or_else(|e| panic!("Unable to read headers from input file {}: {:?}", path.display(), e));
        (reader, headers)
    }
}

36
src/lib.rs Normal file
View file

@ -0,0 +1,36 @@
#[macro_use]
extern crate serde_derive;
extern crate serde;
extern crate serde_json;
extern crate regex;
#[macro_use]
extern crate lazy_static;
extern crate unicode_segmentation;
extern crate time;
extern crate csv;
#[macro_use]
extern crate custom_derive;
#[macro_use]
extern crate newtype_derive;
extern crate rustc_serialize;
mod newtypes;
pub mod transformer;
pub use transformer::{
Transformer,
TransformResult,
TransformResultHelper,
TransformError
};
pub mod transformers;
mod ruleset;
pub use ruleset::{
Rule,
Ruleset,
TransformedRecord,
};
pub mod cli;

111
src/main.rs Normal file
View file

@ -0,0 +1,111 @@
extern crate csv_sanity;
extern crate serde_json;
#[macro_use]
extern crate log;
extern crate regex;
#[macro_use]
extern crate clap;
use csv_sanity::cli::{
self,
Cli,
};
use std::fs::File;
use std::path::Path;
use log::{
LogRecord,
LogLevel,
LogMetadata,
LogLevelFilter,
SetLoggerError
};
use clap::{
App,
Arg
};
/// Minimal `log::Log` implementation that prints enabled records to standard output.
struct ConsoleLogger {
// Records whose level compares greater than this level are not printed.
log_level: LogLevel
}
impl log::Log for ConsoleLogger {
    /// A record is enabled when its level compares less than or equal to `self.log_level`.
    fn enabled(&self, metadata: &LogMetadata) -> bool {
        let level = metadata.level();
        level <= self.log_level
    }

    /// Print an enabled record to standard output as `<level> - <message>`.
    fn log(&self, record: &LogRecord) {
        if !self.enabled(record.metadata()) {
            return;
        }
        println!("{} - {}", record.level(), record.args())
    }
}
/// Install a `ConsoleLogger` as the global logger with both the logger level and the maximum
/// log level filter set to `Info`.
fn init_logging() -> Result<(), SetLoggerError> {
    log::set_logger(|max_log_level| {
        let logger = ConsoleLogger { log_level: LogLevel::Info };
        max_log_level.set(LogLevelFilter::Info);
        Box::new(logger)
    })
}
/// Entry point: parse the command line, load the JSON ruleset and run it against the input file.
///
/// Exits with status 1 (via `exit_with_error`) when the ruleset file cannot be opened or parsed.
fn main() {
    init_logging().unwrap();

    let matches = App::new("Convert CSV")
        .version(crate_version!())
        .author("M. George Hansen <technopolitica@gmail.com>")
        .about("Apply a set of transformations to the records in a CSV file, attempting to read a much valid information from the file as possible.")
        .arg(Arg::with_name("INPUT_FILE")
            .help("CSV file to process")
            .required(true)
            .index(1))
        .arg(Arg::with_name("output")
            .help("File to output the transformed CSV records. Defaults to ./output.csv")
            .short("o")
            .long("output")
            .takes_value(true))
        .arg(Arg::with_name("error_output")
            .help("File to output errors in CSV format. Defaults to ./errors.csv")
            .short("e")
            .long("error_output")
            .takes_value(true))
        .arg(Arg::with_name("ruleset")
            .help("JSON file containing the ruleset to apply. Defaults to ./ruleset.json")
            .short("r")
            .long("ruleset")
            .takes_value(true))
        .get_matches();

    let ruleset_file_path = Path::new(matches.value_of("ruleset").unwrap_or("ruleset.json"));
    let ruleset_file = match File::open(ruleset_file_path) {
        Ok(f) => f,
        Err(e) => exit_with_error(&format!("unable to read ruleset file {}: {}", ruleset_file_path.display(), e)),
    };
    let ruleset = match serde_json::from_reader(ruleset_file) {
        Ok(r) => r,
        Err(e) => exit_with_error(&format!("failed to parse ruleset from {}: {}", ruleset_file_path.display(), e)),
    };

    let cli_app = Cli::new_with_options(ruleset, cli::Options {
        csv_options: cli::CsvOptions {
            delimiter: b'\t',
            .. Default::default()
        },
        .. Default::default()
    });

    // NOTE: Required arguments are validated by clap, so we should be safe to use expect here.
    let input_file_name = matches.value_of("INPUT_FILE").expect("INPUT_FILE argument could not be found!");
    // The lookup keys must match the names given to `Arg::with_name` above; the previous keys
    // ("output_file" / "error_file") never matched, so -o and -e were silently ignored.
    let output_file_name = matches.value_of("output").unwrap_or("output.csv");
    let error_file_name = matches.value_of("error_output").unwrap_or("errors.csv");

    cli_app.run(input_file_name, output_file_name, error_file_name);
}
/// Log `error_msg` at error level, then terminate the process with exit status 1.
fn exit_with_error(error_msg: &str) -> ! {
    error!("{}", error_msg);
    std::process::exit(1)
}

59
src/newtypes.rs Normal file
View file

@ -0,0 +1,59 @@
use std::hash::{
Hash,
Hasher,
};
use regex;
use serde::{
Serialize,
Serializer,
Deserialize,
Deserializer,
};
// Newtype around `regex::Regex` so that comparison, hashing and serde traits can be implemented
// for it in this module (see the impls below the declaration).
custom_derive! {
#[derive(NewtypeFrom, NewtypeDeref, NewtypeDerefMut, Clone, NewtypeDisplay, NewtypeDebug)]
pub struct Regex(regex::Regex);
}
impl PartialEq for Regex {
fn eq(&self, other: &Regex) -> bool
{
self.0.as_str() == other.0.as_str()
}
}
impl Eq for Regex {}
impl Hash for Regex {
fn hash<H>(&self, state: &mut H)
where H: Hasher {
self.as_str().hash(state);
}
}
/// Serializes the regex as its pattern string.
impl Serialize for Regex
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
        where S: Serializer
    {
        serializer.serialize_str(self.0.as_str())
    }
}
/// Deserializes a pattern string and compiles it, reporting an invalid-value error when the
/// pattern does not compile.
impl<'de> Deserialize<'de> for Regex
{
    fn deserialize<D>(deserializer: D) -> Result<Regex, D::Error>
        where D: Deserializer<'de>
    {
        use serde::de::{Unexpected, Error};

        let pattern = String::deserialize(deserializer)?;
        match regex::Regex::new(&pattern) {
            Ok(compiled) => Ok(Regex(compiled)),
            Err(e) => {
                let message: &str = &format!("invalid regex string: {}", e);
                Err(D::Error::invalid_value(Unexpected::Str(&pattern), &message))
            }
        }
    }
}

345
src/ruleset.rs Normal file
View file

@ -0,0 +1,345 @@
use Transformer;
use transformer::{
TransformResult,
TransformError,
};
use transformers::{
Transformers,
TrimTransformer,
NoneTransformer,
};
use std::hash::{
Hash,
Hasher,
};
use std::iter::FromIterator;
use std::cmp::Ordering;
use std::collections::{
BinaryHeap,
HashSet,
};
use std::error;
use std::fmt::{
self,
Formatter,
Display,
};
/// Applicability of a `Rule` determining which CSV record's fields it can be applied to.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub enum Applicability {
/// Applicable to all CSV record fields.
Global,
/// Applicable to a subset of a CSV record's fields, specified by field name.
Fields {
/// Names (CSV header values) of the fields the rule applies to.
field_names: HashSet<String>
}
}
impl Hash for Applicability {
fn hash<H>(&self, state: &mut H)
where H: Hasher {
use self::Applicability::*;
match *self {
Global => (self as *const Applicability).hash(state), // FIXME: Is this the correct way to hash an empty enum variant?
Fields { ref field_names } => field_names.iter().collect::<Vec<&String>>().hash(state)
}
}
}
/// Returns `true` when `priority` equals the default priority of 0.
///
/// Takes a reference because it is used as a serde `skip_serializing_if` predicate.
fn priority_is_default(priority: &isize) -> bool {
    *priority == isize::default()
}
/// A `Transformer` paired with `Applicability` and a priority which can be applied to fields in a
/// CSV record.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct Rule
{
/// Which fields of a record this rule is applied to.
applicability: Applicability,
/// Transformation performed on applicable fields.
transformer: Transformers,
/// Value used to order rules relative to each other. Defaults to 0 when absent from the
/// serialized form and is omitted from serialized output when it is 0.
#[serde(default, skip_serializing_if="priority_is_default")]
priority: isize
}
impl Rule
{
    /// Construct a new `Rule` whose `Transformer` is applicable to one or more CSV record's fields
    /// referenced by name with the default priority of 0.
    ///
    /// # Examples
    /// ```
    /// use csv_sanity::Rule;
    /// use csv_sanity::transformers::*;
    ///
    /// let rule = Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
    /// CapitalizeTransformer::new()
    /// ));
    /// ```
    pub fn for_fields(field_names: &[&str], transformer: Transformers) -> Rule {
        Self::for_fields_with_priority(field_names, transformer, Default::default())
    }

    /// Construct a new `Rule` whose `Transformer` is applicable to one or more CSV record's fields
    /// referenced by name with the specified priority.
    ///
    /// # Examples
    /// ```
    /// use csv_sanity::Rule;
    /// use csv_sanity::transformers::*;
    ///
    /// let rule = Rule::for_fields_with_priority(&["First Name", "Last Name"], Transformers::Capitalize(
    /// CapitalizeTransformer::new()
    /// ), 10);
    /// ```
    pub fn for_fields_with_priority(field_names: &[&str], transformer: Transformers, priority: isize) -> Rule {
        Rule {
            applicability: Applicability::Fields { field_names: field_names.iter().map(|s| s.to_string()).collect() },
            transformer: transformer,
            priority: priority
        }
    }

    /// Construct a new `Rule` applicable to all of a CSV record's fields with the default priority
    /// of 0.
    ///
    /// # Examples
    /// ```
    /// use csv_sanity::Rule;
    /// use csv_sanity::transformers::*;
    ///
    /// let rule = Rule::global(Transformers::Capitalize(
    /// CapitalizeTransformer::new()
    /// ));
    /// ```
    pub fn global(transformer: Transformers) -> Rule {
        Self::global_with_priority(transformer, Default::default())
    }

    /// Construct a new `Rule` applicable to all of a CSV record's fields with the specified
    /// priority.
    ///
    /// # Examples
    /// ```
    /// use csv_sanity::Rule;
    /// use csv_sanity::transformers::*;
    ///
    /// let rule = Rule::global_with_priority(Transformers::Capitalize(
    /// CapitalizeTransformer::new()
    /// ), 10);
    /// ```
    pub fn global_with_priority(transformer: Transformers, priority: isize) -> Rule {
        Rule {
            applicability: Applicability::Global,
            transformer: transformer,
            priority: priority
        }
    }

    /// Apply this rule to a CSV record's field, returning the resulting `TransformResult`.
    ///
    /// When the rule is not applicable to `field_name`, the field value is passed through
    /// unchanged as `Ok(Some(..))`.
    ///
    /// # Examples
    /// ```
    /// use csv_sanity::Rule;
    /// use csv_sanity::transformers::*;
    ///
    /// let field = "JOHN";
    /// let field_name = "First Name";
    ///
    /// let rule = Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
    /// CapitalizeTransformer::new()
    /// ));
    /// rule.apply(field, field_name, 1);
    /// ```
    pub fn apply(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
        // XXX: Does the applicability check belong inside the apply method? Or should the caller
        // decide?
        let applicable = match self.applicability {
            Applicability::Global => true,
            // `HashSet<String>` can be queried with a `&str` directly; no temporary `String`
            // needs to be allocated for the lookup.
            Applicability::Fields { ref field_names } => field_names.contains(field_name),
        };
        if applicable {
            self.transformer.transform(field_value, field_name, record_n)
        } else {
            Ok(Some(field_value.to_string()))
        }
    }
}
/// Rules are ordered by priority in reverse: a rule with a lower priority value compares as
/// greater.
impl Ord for Rule
{
    fn cmp(&self, other: &Self) -> Ordering {
        self.priority.cmp(&other.priority).reverse()
    }
}
/// Partial ordering that always agrees with the total ordering defined by `Ord`.
impl PartialOrd for Rule
{
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
/// An ordered set of `Rule`s sorted by priority.
///
/// # Examples
/// ```
/// use csv_sanity::{
/// Ruleset,
/// Rule,
/// TransformedRecord,
/// };
/// use csv_sanity::transformers::*;
/// let ruleset = {
/// let mut r = Ruleset::new();
/// r.add_rule(Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
/// CapitalizeTransformer::new()
/// )));
/// r.add_rule(Rule::for_fields(&["Email"], Transformers::Email(
/// EmailTransformer::new()
/// )));
/// r
/// };
/// let headers = vec!["Id", "First Name", "Last Name", "Email"].iter().map(|s| s.to_string()).collect();
/// let record = vec!["1", " JOHN", "SNOW ", "\t JSNOW@EXAMPLE.COM "].iter().map(|s| s.to_string()).collect();
/// let transformed_record = ruleset.apply_rules(&headers, &record, 1);
/// assert_eq!(TransformedRecord {
/// field_values: vec!["1", "John", "Snow", "jsnow@example.com"].iter().map(|s| Some(s.to_string())).collect(),
/// errors: Vec::new(),
/// }, transformed_record);
/// ```
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Ruleset {
/// Rules held in a binary heap keyed by `Rule`'s `Ord` implementation.
// NOTE(review): `BinaryHeap::iter` visits elements in arbitrary order, so iterating this field
// directly does not yield the rules in priority order.
rules: BinaryHeap<Rule>
}
impl Ruleset {
    /// Construct a new `Ruleset` with a default `NoneTransformer` and `TrimTransformer` global
    /// rules.
    ///
    /// The default trim and none rules should be appropriate for most CSV files. For CSV files
    /// where these default rules are not desired use the `Ruleset::without_default_rules` method.
    pub fn new() -> Ruleset {
        let mut ruleset = Self::without_default_rules();
        // Add a default trim rule and blank rule to match empty fields.
        ruleset.add_rule(Rule::global_with_priority(Transformers::None(NoneTransformer::with_blank_matcher()), -10));
        ruleset.add_rule(Rule::global_with_priority(Transformers::Trim(TrimTransformer::new()), -10));
        ruleset
    }

    /// Construct a new `Ruleset` without any of the default rules.
    pub fn without_default_rules() -> Ruleset {
        Ruleset {
            rules: BinaryHeap::new()
        }
    }

    /// Add a `Rule` to this ruleset.
    pub fn add_rule(&mut self, rule: Rule) {
        self.rules.push(rule);
    }

    /// Validate this ruleset against a CSV file by comparing its `Rule`s against the headers.
    ///
    /// Returns one `ValidationError` for every field-specific rule that names at least one field
    /// missing from `headers`.
    pub fn validate_rules(&self, headers: &Vec<String>) -> Result<(), Vec<ValidationError>> {
        // The header lookup set is the same for every rule, so build it once up front.
        let header_set = HashSet::<&str>::from_iter(headers.iter().map(|header| header.as_str()));
        let mut errors = Vec::new();
        for rule in self.rules.iter() {
            if let Applicability::Fields { ref field_names } = rule.applicability {
                let missing: HashSet<&String> = field_names.iter()
                    .filter(|name| !header_set.contains(name.as_str()))
                    .collect();
                if !missing.is_empty() {
                    errors.push(
                        ValidationError {
                            reason: format!("The following fields were not found in headers: '{:?}'", missing),
                        }
                    )
                }
            }
        }
        if errors.is_empty() {
            Ok(())
        } else {
            Err(errors)
        }
    }

    /// Apply this `Ruleset` to a record from a CSV file.
    ///
    /// Each field is run through every rule in ascending order of priority value. A transformer
    /// error is recorded and the field becomes `None`; once a field is `None` the remaining rules
    /// are skipped for it. Fields beyond the number of headers are reported as errors and left
    /// out of the transformed record.
    pub fn apply_rules(&self, headers: &Vec<String>, fields: &Vec<String>, record_n: usize) -> TransformedRecord {
        let expected_n_fields = headers.len();
        // Sort once per record: iterating the heap directly yields an arbitrary order.
        let ordered_rules = self.rules_in_priority_order();
        let mut errors: Vec<TransformError> = Vec::new();
        let mut transformed_fields: Vec<Option<String>> = Vec::with_capacity(fields.len());
        for (field_n, field_value) in fields.iter().enumerate() {
            let field_name = match headers.get(field_n) {
                Some(name) => name,
                None => {
                    errors.push(
                        TransformError {
                            field_value: field_value.to_string(),
                            field_name: field_n.to_string(),
                            record_n: record_n,
                            reason: format!("found {} header fields but record had extra field at position {}", expected_n_fields, field_n)
                        }
                    );
                    continue;
                }
            };
            let mut transformed_field_value = Some(field_value.clone());
            // Try each rule in order of priority and test to see if it is applicable.
            for rule in ordered_rules.iter() {
                let new_value = match transformed_field_value {
                    Some(ref fv) => match rule.apply(fv, field_name, record_n) {
                        Ok(tfv) => tfv,
                        Err(e) => {
                            errors.push(e);
                            None
                        }
                    },
                    // The last transformer returned None, so we can short circuit and just
                    // return None for the field value.
                    None => break
                };
                transformed_field_value = new_value;
            }
            transformed_fields.push(transformed_field_value);
        }
        TransformedRecord {
            field_values: transformed_fields,
            errors: errors,
        }
    }

    /// Rules sorted by ascending priority value, the order in which the heap would pop them given
    /// `Rule`'s reversed `Ord`. The sort is stable, so rules of equal priority keep the relative
    /// order in which the heap iterates them.
    fn rules_in_priority_order(&self) -> Vec<&Rule> {
        let mut ordered: Vec<&Rule> = self.rules.iter().collect();
        ordered.sort_by_key(|rule| rule.priority);
        ordered
    }
}
/// Error for when a `Ruleset` does not validate against a CSV file.
#[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Hash, Debug)]
pub struct ValidationError {
/// Human-readable description of the validation failure.
reason: String,
}
/// Displays the validation failure reason verbatim.
impl Display for ValidationError
{
    fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
        formatter.write_str(&self.reason)
    }
}
/// Standard error integration; the description is the validation failure reason.
impl error::Error for ValidationError
{
    fn description(&self) -> &str {
        self.reason.as_str()
    }
}
/// A single processed and transformed record.
#[derive(Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
pub struct TransformedRecord {
/// Transformed fields for the record.
///
/// Empty fields are explicitly encoded as `None` values.
pub field_values: Vec<Option<String>>,
/// Errors that were encountered during transformation, if any.
pub errors: Vec<TransformError>,
}

91
src/transformer.rs Normal file
View file

@ -0,0 +1,91 @@
//! Traits and types that define transformations on CSV record fields.
use std::result;
use std::error;
use std::fmt::{
self,
Formatter,
Display,
};
/// `Result` for the transformation of a CSV record's field, either an `Option<String>` if
/// successfully transformed or a `TransformError` if unsuccessful.
///
/// `Ok(None)` represents a field that was successfully transformed to an empty value.
pub type TransformResult = result::Result<Option<String>, TransformError>;
/// Helper trait with a few useful utility methods for constructing `TransformResult`.
pub trait TransformResultHelper
{
    /// Construct a `TransformResult` that represents a successful transformation of a CSV record's
    /// field with a non-empty value.
    fn present(value: &str) -> TransformResult {
        let owned = String::from(value);
        Ok(Some(owned))
    }

    /// Construct a `TransformResult` that represents a successful transformation of a CSV record's
    /// field with an empty value.
    fn excluded() -> TransformResult {
        let no_value: Option<String> = None;
        Ok(no_value)
    }

    /// Construct a `TransformResult` that represents a failed transformation of a CSV record's
    /// field with a descriptive error reason.
    ///
    /// An error reason should be a short, single sentence without punctuation or capitalization,
    /// e.g. "not a valid email address" instead of "The email address was invalid.".
    ///
    /// ```
    /// use csv_sanity::transformer::{
    /// TransformResult,
    /// TransformError,
    /// TransformResultHelper,
    /// };
    ///
    /// let result = TransformResult::error("jak,.@hot mail.com", "Email", 0, "not a valid email address");
    /// assert_eq!(result, Err(TransformError {
    /// field_value: "jak,.@hot mail.com".to_string(),
    /// field_name: "Email".to_string(),
    /// record_n: 0,
    /// reason: "not a valid email address".to_string(),
    /// }));
    /// ```
    fn error(field_value: &str, field_name: &str, record_n: usize, reason: &str) -> TransformResult {
        let failure = TransformError {
            record_n: record_n,
            field_name: String::from(field_name),
            field_value: String::from(field_value),
            reason: String::from(reason),
        };
        Err(failure)
    }
}
// Makes the helper constructors callable as `TransformResult::present(..)`,
// `TransformResult::excluded()` and `TransformResult::error(..)`.
impl TransformResultHelper for TransformResult {}
/// A transformation that can be applied to a single field of a CSV record.
pub trait Transformer
{
/// Transform `field_value`, the value of the field named `field_name` in record number
/// `record_n`, returning the resulting `TransformResult`.
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult;
}
/// Error describing why a CSV record's field could not be transformed.
#[derive(RustcEncodable, Deserialize, Serialize, Clone, PartialEq, Eq, Hash, Debug)]
pub struct TransformError
{
/// Number of the record the error was raised for, as supplied by the caller.
pub record_n: usize,
/// Name of the field the error was raised for.
pub field_name: String,
/// Value of the field that could not be transformed.
pub field_value: String,
/// Short description of why the transformation failed.
pub reason: String,
}
/// Displays as `failed to transform field: <reason>`.
impl Display for TransformError
{
    fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
        formatter.write_str("failed to transform field: ")?;
        formatter.write_str(&self.reason)
    }
}
/// Standard error integration; the description is the bare failure reason.
impl error::Error for TransformError
{
    fn description(&self) -> &str {
        self.reason.as_str()
    }
}

View file

@ -0,0 +1,41 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
use unicode_segmentation::UnicodeSegmentation;
pub fn capitalize(string: &str) -> String
{
string.unicode_words()
.map(capitalize_word).collect::<Vec<String>>()
.join(" ")
}
/// Uppercase the first character of `word` and lowercase every following character.
fn capitalize_word(word: &str) -> String
{
    let mut letters = word.chars();
    let mut capitalized = String::new();
    if let Some(first) = letters.next() {
        capitalized.extend(first.to_uppercase());
    }
    for letter in letters {
        capitalized.extend(letter.to_lowercase());
    }
    capitalized
}
/// Transformer that capitalizes the words of a field.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct CapitalizeTransformer {}

impl CapitalizeTransformer
{
    /// Construct a new `CapitalizeTransformer`.
    pub fn new() -> Self
    {
        Self {}
    }
}
impl Transformer for CapitalizeTransformer
{
    /// Always succeeds with the capitalized form of `field_value`; the field name and record
    /// number are unused.
    fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult
    {
        TransformResult::present(&capitalize(field_value))
    }
}

View file

@ -0,0 +1,37 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
/// Transformer that accepts a field only when its value is one of a fixed list of choices.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct ChoiceTransformer {
    /// Values a field is allowed to have.
    choices: Vec<String>,
}

impl ChoiceTransformer
{
    /// Construct a new `ChoiceTransformer` accepting exactly the given `choices`.
    pub fn new(choices: Vec<String>) -> Self
    {
        Self { choices: choices }
    }
}
impl Transformer for ChoiceTransformer
{
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult
{
if self.choices.contains(&field_value.to_string()) {
TransformResult::present(&field_value)
} else {
TransformResult::error(
field_value,
field_name,
record_n,
&format!("not in valid choices {:?}", self.choices)
)
}
}
}

41
src/transformers/date.rs Normal file
View file

@ -0,0 +1,41 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
use time::{
strptime
};
/// Transformer that parses a date using a list of accepted input formats and re-formats it with
/// a single output format.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct DateTransformer {
    /// `strptime` formats tried, in order, when parsing a field.
    input_formats: Vec<String>,
    /// `strftime` format used to write the parsed date.
    output_format: String
}

impl DateTransformer {
    /// Construct a `DateTransformer` from the accepted input formats and the output format.
    pub fn new(input_formats: Vec<String>, output_format: &str) -> Self {
        let output_format = String::from(output_format);
        DateTransformer {
            output_format: output_format,
            input_formats: input_formats,
        }
    }

    /// Construct a `DateTransformer` whose output format is `%F`.
    pub fn with_iso8601_output(input_formats: Vec<String>) -> Self {
        DateTransformer::new(input_formats, "%F")
    }
}
impl Transformer for DateTransformer {
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
for format in self.input_formats.iter() {
if let Ok(time) = strptime(field_value, &format) {
return TransformResult::present(
&format!("{}", time.strftime(&self.output_format).unwrap())
);
}
}
TransformResult::error(field_value, field_name, record_n, "unable to parse as date")
}
}

30
src/transformers/email.rs Normal file
View file

@ -0,0 +1,30 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
use regex::Regex;
// Case-insensitive pattern anchored to the whole field: one or more dot-separated local-part
// atoms, an `@`, then two or more dot-separated domain labels that start and end with an
// alphanumeric character.
lazy_static! {
static ref EMAIL_REGEX: Regex = Regex::new(r"(?i)\A[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\z").unwrap();
}
/// Transformer that validates a field as an email address and lowercases it.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct EmailTransformer {}

impl EmailTransformer {
    /// Construct a new `EmailTransformer`.
    pub fn new() -> Self {
        Self {}
    }
}
impl Transformer for EmailTransformer {
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
if EMAIL_REGEX.is_match(field_value) {
TransformResult::present(&field_value.to_lowercase())
} else {
TransformResult::error(field_value, field_name, record_n, "invalid email address")
}
}
}

76
src/transformers/mod.rs Normal file
View file

@ -0,0 +1,76 @@
use transformer::{
Transformer,
TransformResult,
};
mod trim;
pub use self::trim::TrimTransformer;
mod none;
pub use self::none::NoneTransformer;
mod regex;
pub use self::regex::{
RegexTransformer,
RegexMatchTransformer
};
mod capitalize;
pub use self::capitalize::{
CapitalizeTransformer,
capitalize
};
mod email;
pub use self::email::EmailTransformer;
mod number;
pub use self::number::NumberTransformer;
mod date;
pub use self::date::DateTransformer;
mod choice;
pub use self::choice::ChoiceTransformer;
mod zipcode;
pub use self::zipcode::ZipcodeTransformer;
mod phone_number;
pub use self::phone_number::PhoneNumberTransformer;
/// Closed set of all available transformers, one variant per concrete transformer type.
///
/// Implements `Transformer` by delegating to the wrapped transformer.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub enum Transformers
{
/// See `TrimTransformer`.
Trim(TrimTransformer),
/// See `NoneTransformer`.
None(NoneTransformer),
/// See `RegexTransformer`.
Regex(RegexTransformer),
/// See `RegexMatchTransformer`.
RegexMatch(RegexMatchTransformer),
/// See `CapitalizeTransformer`.
Capitalize(CapitalizeTransformer),
/// See `EmailTransformer`.
Email(EmailTransformer),
/// See `NumberTransformer`.
Number(NumberTransformer),
/// See `DateTransformer`.
Date(DateTransformer),
/// See `ChoiceTransformer`.
Choice(ChoiceTransformer),
/// See `ZipcodeTransformer`.
Zipcode(ZipcodeTransformer),
/// See `PhoneNumberTransformer`.
PhoneNumber(PhoneNumberTransformer),
}
impl Transformer for Transformers {
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
use self::Transformers::*;
match *self {
Trim(ref t) => t.transform(field_value, field_name, record_n),
None(ref t) => t.transform(field_value, field_name, record_n),
Regex(ref t) => t.transform(field_value, field_name, record_n),
RegexMatch(ref t) => t.transform(field_value, field_name, record_n),
Capitalize(ref t) => t.transform(field_value, field_name, record_n),
Email(ref t) => t.transform(field_value, field_name, record_n),
Number(ref t) => t.transform(field_value, field_name, record_n),
Date(ref t) => t.transform(field_value, field_name, record_n),
Choice(ref t) => t.transform(field_value, field_name, record_n),
Zipcode(ref t) => t.transform(field_value, field_name, record_n),
PhoneNumber(ref t) => t.transform(field_value, field_name, record_n)
}
}
}

34
src/transformers/none.rs Normal file
View file

@ -0,0 +1,34 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
use newtypes::Regex;
use regex;
/// Transformer that turns any field matching a regular expression into an empty (`None`) value.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct NoneTransformer {
    /// Fields matching this pattern are excluded.
    regex: Regex
}

impl NoneTransformer {
    /// Construct a `NoneTransformer` that excludes fields matching `regex`.
    pub fn new(regex: regex::Regex) -> NoneTransformer {
        NoneTransformer { regex: Regex::from(regex) }
    }

    /// Construct a `NoneTransformer` that excludes fields consisting solely of control and
    /// whitespace characters (including the empty field).
    pub fn with_blank_matcher() -> NoneTransformer {
        // The POSIX class has to sit inside a bracket expression: a bare `[:cntrl:]` is the
        // character set {':', 'c', 'n', 't', 'r', 'l'}, which treated fields such as "tnt"
        // as blank.
        Self::new(regex::Regex::new(r"\A(?:[[:cntrl:]]|\s)*\z").unwrap())
    }
}
impl Transformer for NoneTransformer {
    /// Exclude `field_value` when it matches the configured pattern, otherwise pass it through
    /// unchanged. The field name and record number are unused.
    fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult {
        let is_blank = self.regex.is_match(field_value);
        match is_blank {
            true => TransformResult::excluded(),
            false => TransformResult::present(field_value),
        }
    }
}

View file

@ -0,0 +1,30 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
use regex::Regex;
// Matches a non-negative decimal integer with no leading zeros ("0", or digits starting with
// 1-9), anchored to the whole field. The group is non-capturing `(?:...)`; the previous `(:?...)`
// was a capturing group with an optional leading ':' and therefore accepted ":0".
lazy_static! {
    static ref INTEGER_REGEX: Regex = Regex::new(r"\A(?:0|[1-9]\d*)\z").unwrap();
}
/// Transformer that validates a field against `INTEGER_REGEX`.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct NumberTransformer {}

impl NumberTransformer {
    /// Construct a `NumberTransformer` that accepts integers.
    pub fn match_integer() -> Self {
        Self {}
    }
}
impl Transformer for NumberTransformer {
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
if INTEGER_REGEX.is_match(field_value) {
TransformResult::present(field_value)
} else {
TransformResult::error(field_value, field_name, record_n, "not a valid number")
}
}
}

View file

@ -0,0 +1,34 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
use regex::Regex;
// Anchored pattern for a ten-digit phone number: an optional leading "1" or "+1", then a
// three-digit area code (optionally parenthesized), a three-digit exchange code and a four-digit
// subscriber number, with any run of non-digit characters allowed between the groups.
lazy_static! {
static ref NANP_REGEX: Regex = Regex::new(r"\A(?:\+?1)?\D*\(?(?P<area>\d{3})\)?\D*(?P<exchange>\d{3})\D*(?P<subscriber>\d{4})\z").unwrap();
}
/// Transformer that validates a field against `NANP_REGEX` and normalizes it.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct PhoneNumberTransformer {}

impl PhoneNumberTransformer {
    /// Construct a `PhoneNumberTransformer` that expects NANP format phone numbers.
    pub fn expect_nanp_format() -> Self {
        Self {}
    }
}
impl Transformer for PhoneNumberTransformer {
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
if let Some(captures) = NANP_REGEX.captures(field_value) {
let area_code = captures.name("area").unwrap().as_str();
let exchange_code = captures.name("exchange").unwrap().as_str();
let subscriber_number = captures.name("subscriber").unwrap().as_str();
let phone_number = format!("+1 {} {} {}", area_code, exchange_code, subscriber_number);
TransformResult::present(&phone_number)
} else {
TransformResult::error(field_value, field_name, record_n, "not a valid NANP format phone number")
}
}
}

88
src/transformers/regex.rs Normal file
View file

@ -0,0 +1,88 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
use newtypes::Regex;
use regex;
/// Transformer that rewrites a field by expanding a template with the capture groups of a
/// regular expression match.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct RegexTransformer
{
    /// Pattern a field has to match.
    regex: Regex,
    /// Replacement template expanded with the match's capture groups.
    template: String
}

impl RegexTransformer
{
    /// Construct a `RegexTransformer` from a pattern and a replacement template.
    pub fn new(regex: regex::Regex, template: &str) -> Self {
        let template = String::from(template);
        RegexTransformer {
            template: template,
            regex: Regex::from(regex),
        }
    }
}
impl Transformer for RegexTransformer
{
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
if let Some(captures) = self.regex.captures(field_value) {
let mut expansion = String::new();
captures.expand(&self.template, &mut expansion);
TransformResult::present(&expansion)
} else {
TransformResult::error(
field_value,
field_name,
record_n,
&format!("did not match pattern {}", self.regex)
)
}
}
}
/// Transformer that accepts or rejects a field depending on whether it matches a regular
/// expression.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct RegexMatchTransformer
{
    /// Pattern tested against the field.
    regex: Regex,
    /// When `true`, fields matching the pattern are rejected instead of accepted.
    negate: bool
}

impl RegexMatchTransformer
{
    /// Construct a transformer that accepts only fields matching `regex`.
    pub fn matching(regex: regex::Regex) -> Self {
        Self::with_negation(regex, false)
    }

    /// Construct a transformer that accepts only fields that do not match `regex`.
    pub fn not_matching(regex: regex::Regex) -> Self {
        Self::with_negation(regex, true)
    }

    /// Shared constructor for both polarities.
    fn with_negation(regex: regex::Regex, negate: bool) -> Self {
        RegexMatchTransformer {
            regex: Regex::from(regex),
            negate: negate
        }
    }
}
impl Transformer for RegexMatchTransformer
{
    /// Pass `field_value` through unchanged when the match outcome agrees with the configured
    /// polarity, otherwise return an error naming the pattern.
    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
        let matched = self.regex.is_match(field_value);
        // Accepted when it matched and we are not negating, or did not match and we are.
        if matched != self.negate {
            return TransformResult::present(field_value);
        }
        let reason = match self.negate {
            true => format!("matched exclusionary pattern {}", self.regex),
            false => format!("did not match pattern {}", self.regex),
        };
        TransformResult::error(field_value, field_name, record_n, &reason)
    }
}

20
src/transformers/trim.rs Normal file
View file

@ -0,0 +1,20 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
/// Transformer that strips leading and trailing whitespace from a field.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct TrimTransformer {}

impl TrimTransformer {
    /// Construct a new `TrimTransformer`.
    pub fn new() -> Self {
        Self {}
    }
}
impl Transformer for TrimTransformer {
    /// Always succeeds with `field_value` stripped of leading and trailing whitespace; the field
    /// name and record number are unused.
    fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult {
        let trimmed = field_value.trim();
        TransformResult::present(trimmed)
    }
}

View file

@ -0,0 +1,37 @@
use Transformer;
use transformer::{
TransformResultHelper,
TransformResult
};
use regex::Regex;
// Anchored pattern: a five-digit base code (group 1), any run of non-digit characters, then an
// optional four-digit extension (group 2).
lazy_static! {
static ref ZIP_REGEX: Regex = Regex::new(r"\A(\d{5})\D*(?:(\d{4}))?\z").unwrap();
}
/// Transformer that validates a field against `ZIP_REGEX` and normalizes it.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct ZipcodeTransformer {}

impl ZipcodeTransformer {
    /// Construct a new `ZipcodeTransformer`.
    pub fn new() -> Self {
        Self {}
    }
}
impl Transformer for ZipcodeTransformer {
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
if let Some(captures) = ZIP_REGEX.captures(field_value) {
let base_code = captures.get(1).unwrap();
let plus_four_code = captures.get(2);
let zipcode = if let Some(pfc) = plus_four_code {
format!("{}-{}", base_code.as_str(), pfc.as_str())
} else {
base_code.as_str().to_string()
};
TransformResult::present(&zipcode)
} else {
TransformResult::error(field_value, field_name, record_n, "not a valid zipcode")
}
}
}