Initial commit
This commit is contained in:
commit
26c5433d16
21 changed files with 1968 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
target
|
||||
Cargo.lock
|
||||
22
Cargo.toml
Normal file
22
Cargo.toml
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
[package]
|
||||
name = "csv-sanity"
|
||||
version = "0.1.0"
|
||||
authors = ["M. George Hansen <technopolitica@gmail.com>"]
|
||||
license = "MPL-2.0"
|
||||

[badges]
maintenance = { status = "passively-maintained" }
|
||||
|
||||
[dependencies]
|
||||
csv = "0.15.0"
|
||||
clap = "2.23.3"
|
||||
log = "0.3.7"
|
||||
regex = "0.2.1"
|
||||
lazy_static = "0.2.8"
|
||||
unicode-segmentation = "1.1.0"
|
||||
time = "0.1.37"
|
||||
maplit = "0.1.4"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
custom_derive = "0.1.7"
|
||||
newtype_derive = "0.1.6"
|
||||
rustc-serialize = "0.3"
|
||||
373
LICENSE
Normal file
373
LICENSE
Normal file
|
|
@ -0,0 +1,373 @@
|
|||
Mozilla Public License Version 2.0
|
||||
==================================
|
||||
|
||||
1. Definitions
|
||||
--------------
|
||||
|
||||
1.1. "Contributor"
|
||||
means each individual or legal entity that creates, contributes to
|
||||
the creation of, or owns Covered Software.
|
||||
|
||||
1.2. "Contributor Version"
|
||||
means the combination of the Contributions of others (if any) used
|
||||
by a Contributor and that particular Contributor's Contribution.
|
||||
|
||||
1.3. "Contribution"
|
||||
means Covered Software of a particular Contributor.
|
||||
|
||||
1.4. "Covered Software"
|
||||
means Source Code Form to which the initial Contributor has attached
|
||||
the notice in Exhibit A, the Executable Form of such Source Code
|
||||
Form, and Modifications of such Source Code Form, in each case
|
||||
including portions thereof.
|
||||
|
||||
1.5. "Incompatible With Secondary Licenses"
|
||||
means
|
||||
|
||||
(a) that the initial Contributor has attached the notice described
|
||||
in Exhibit B to the Covered Software; or
|
||||
|
||||
(b) that the Covered Software was made available under the terms of
|
||||
version 1.1 or earlier of the License, but not also under the
|
||||
terms of a Secondary License.
|
||||
|
||||
1.6. "Executable Form"
|
||||
means any form of the work other than Source Code Form.
|
||||
|
||||
1.7. "Larger Work"
|
||||
means a work that combines Covered Software with other material, in
|
||||
a separate file or files, that is not Covered Software.
|
||||
|
||||
1.8. "License"
|
||||
means this document.
|
||||
|
||||
1.9. "Licensable"
|
||||
means having the right to grant, to the maximum extent possible,
|
||||
whether at the time of the initial grant or subsequently, any and
|
||||
all of the rights conveyed by this License.
|
||||
|
||||
1.10. "Modifications"
|
||||
means any of the following:
|
||||
|
||||
(a) any file in Source Code Form that results from an addition to,
|
||||
deletion from, or modification of the contents of Covered
|
||||
Software; or
|
||||
|
||||
(b) any new file in Source Code Form that contains any Covered
|
||||
Software.
|
||||
|
||||
1.11. "Patent Claims" of a Contributor
|
||||
means any patent claim(s), including without limitation, method,
|
||||
process, and apparatus claims, in any patent Licensable by such
|
||||
Contributor that would be infringed, but for the grant of the
|
||||
License, by the making, using, selling, offering for sale, having
|
||||
made, import, or transfer of either its Contributions or its
|
||||
Contributor Version.
|
||||
|
||||
1.12. "Secondary License"
|
||||
means either the GNU General Public License, Version 2.0, the GNU
|
||||
Lesser General Public License, Version 2.1, the GNU Affero General
|
||||
Public License, Version 3.0, or any later versions of those
|
||||
licenses.
|
||||
|
||||
1.13. "Source Code Form"
|
||||
means the form of the work preferred for making modifications.
|
||||
|
||||
1.14. "You" (or "Your")
|
||||
means an individual or a legal entity exercising rights under this
|
||||
License. For legal entities, "You" includes any entity that
|
||||
controls, is controlled by, or is under common control with You. For
|
||||
purposes of this definition, "control" means (a) the power, direct
|
||||
or indirect, to cause the direction or management of such entity,
|
||||
whether by contract or otherwise, or (b) ownership of more than
|
||||
fifty percent (50%) of the outstanding shares or beneficial
|
||||
ownership of such entity.
|
||||
|
||||
2. License Grants and Conditions
|
||||
--------------------------------
|
||||
|
||||
2.1. Grants
|
||||
|
||||
Each Contributor hereby grants You a world-wide, royalty-free,
|
||||
non-exclusive license:
|
||||
|
||||
(a) under intellectual property rights (other than patent or trademark)
|
||||
Licensable by such Contributor to use, reproduce, make available,
|
||||
modify, display, perform, distribute, and otherwise exploit its
|
||||
Contributions, either on an unmodified basis, with Modifications, or
|
||||
as part of a Larger Work; and
|
||||
|
||||
(b) under Patent Claims of such Contributor to make, use, sell, offer
|
||||
for sale, have made, import, and otherwise transfer either its
|
||||
Contributions or its Contributor Version.
|
||||
|
||||
2.2. Effective Date
|
||||
|
||||
The licenses granted in Section 2.1 with respect to any Contribution
|
||||
become effective for each Contribution on the date the Contributor first
|
||||
distributes such Contribution.
|
||||
|
||||
2.3. Limitations on Grant Scope
|
||||
|
||||
The licenses granted in this Section 2 are the only rights granted under
|
||||
this License. No additional rights or licenses will be implied from the
|
||||
distribution or licensing of Covered Software under this License.
|
||||
Notwithstanding Section 2.1(b) above, no patent license is granted by a
|
||||
Contributor:
|
||||
|
||||
(a) for any code that a Contributor has removed from Covered Software;
|
||||
or
|
||||
|
||||
(b) for infringements caused by: (i) Your and any other third party's
|
||||
modifications of Covered Software, or (ii) the combination of its
|
||||
Contributions with other software (except as part of its Contributor
|
||||
Version); or
|
||||
|
||||
(c) under Patent Claims infringed by Covered Software in the absence of
|
||||
its Contributions.
|
||||
|
||||
This License does not grant any rights in the trademarks, service marks,
|
||||
or logos of any Contributor (except as may be necessary to comply with
|
||||
the notice requirements in Section 3.4).
|
||||
|
||||
2.4. Subsequent Licenses
|
||||
|
||||
No Contributor makes additional grants as a result of Your choice to
|
||||
distribute the Covered Software under a subsequent version of this
|
||||
License (see Section 10.2) or under the terms of a Secondary License (if
|
||||
permitted under the terms of Section 3.3).
|
||||
|
||||
2.5. Representation
|
||||
|
||||
Each Contributor represents that the Contributor believes its
|
||||
Contributions are its original creation(s) or it has sufficient rights
|
||||
to grant the rights to its Contributions conveyed by this License.
|
||||
|
||||
2.6. Fair Use
|
||||
|
||||
This License is not intended to limit any rights You have under
|
||||
applicable copyright doctrines of fair use, fair dealing, or other
|
||||
equivalents.
|
||||
|
||||
2.7. Conditions
|
||||
|
||||
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
|
||||
in Section 2.1.
|
||||
|
||||
3. Responsibilities
|
||||
-------------------
|
||||
|
||||
3.1. Distribution of Source Form
|
||||
|
||||
All distribution of Covered Software in Source Code Form, including any
|
||||
Modifications that You create or to which You contribute, must be under
|
||||
the terms of this License. You must inform recipients that the Source
|
||||
Code Form of the Covered Software is governed by the terms of this
|
||||
License, and how they can obtain a copy of this License. You may not
|
||||
attempt to alter or restrict the recipients' rights in the Source Code
|
||||
Form.
|
||||
|
||||
3.2. Distribution of Executable Form
|
||||
|
||||
If You distribute Covered Software in Executable Form then:
|
||||
|
||||
(a) such Covered Software must also be made available in Source Code
|
||||
Form, as described in Section 3.1, and You must inform recipients of
|
||||
the Executable Form how they can obtain a copy of such Source Code
|
||||
Form by reasonable means in a timely manner, at a charge no more
|
||||
than the cost of distribution to the recipient; and
|
||||
|
||||
(b) You may distribute such Executable Form under the terms of this
|
||||
License, or sublicense it under different terms, provided that the
|
||||
license for the Executable Form does not attempt to limit or alter
|
||||
the recipients' rights in the Source Code Form under this License.
|
||||
|
||||
3.3. Distribution of a Larger Work
|
||||
|
||||
You may create and distribute a Larger Work under terms of Your choice,
|
||||
provided that You also comply with the requirements of this License for
|
||||
the Covered Software. If the Larger Work is a combination of Covered
|
||||
Software with a work governed by one or more Secondary Licenses, and the
|
||||
Covered Software is not Incompatible With Secondary Licenses, this
|
||||
License permits You to additionally distribute such Covered Software
|
||||
under the terms of such Secondary License(s), so that the recipient of
|
||||
the Larger Work may, at their option, further distribute the Covered
|
||||
Software under the terms of either this License or such Secondary
|
||||
License(s).
|
||||
|
||||
3.4. Notices
|
||||
|
||||
You may not remove or alter the substance of any license notices
|
||||
(including copyright notices, patent notices, disclaimers of warranty,
|
||||
or limitations of liability) contained within the Source Code Form of
|
||||
the Covered Software, except that You may alter any license notices to
|
||||
the extent required to remedy known factual inaccuracies.
|
||||
|
||||
3.5. Application of Additional Terms
|
||||
|
||||
You may choose to offer, and to charge a fee for, warranty, support,
|
||||
indemnity or liability obligations to one or more recipients of Covered
|
||||
Software. However, You may do so only on Your own behalf, and not on
|
||||
behalf of any Contributor. You must make it absolutely clear that any
|
||||
such warranty, support, indemnity, or liability obligation is offered by
|
||||
You alone, and You hereby agree to indemnify every Contributor for any
|
||||
liability incurred by such Contributor as a result of warranty, support,
|
||||
indemnity or liability terms You offer. You may include additional
|
||||
disclaimers of warranty and limitations of liability specific to any
|
||||
jurisdiction.
|
||||
|
||||
4. Inability to Comply Due to Statute or Regulation
|
||||
---------------------------------------------------
|
||||
|
||||
If it is impossible for You to comply with any of the terms of this
|
||||
License with respect to some or all of the Covered Software due to
|
||||
statute, judicial order, or regulation then You must: (a) comply with
|
||||
the terms of this License to the maximum extent possible; and (b)
|
||||
describe the limitations and the code they affect. Such description must
|
||||
be placed in a text file included with all distributions of the Covered
|
||||
Software under this License. Except to the extent prohibited by statute
|
||||
or regulation, such description must be sufficiently detailed for a
|
||||
recipient of ordinary skill to be able to understand it.
|
||||
|
||||
5. Termination
|
||||
--------------
|
||||
|
||||
5.1. The rights granted under this License will terminate automatically
|
||||
if You fail to comply with any of its terms. However, if You become
|
||||
compliant, then the rights granted under this License from a particular
|
||||
Contributor are reinstated (a) provisionally, unless and until such
|
||||
Contributor explicitly and finally terminates Your grants, and (b) on an
|
||||
ongoing basis, if such Contributor fails to notify You of the
|
||||
non-compliance by some reasonable means prior to 60 days after You have
|
||||
come back into compliance. Moreover, Your grants from a particular
|
||||
Contributor are reinstated on an ongoing basis if such Contributor
|
||||
notifies You of the non-compliance by some reasonable means, this is the
|
||||
first time You have received notice of non-compliance with this License
|
||||
from such Contributor, and You become compliant prior to 30 days after
|
||||
Your receipt of the notice.
|
||||
|
||||
5.2. If You initiate litigation against any entity by asserting a patent
|
||||
infringement claim (excluding declaratory judgment actions,
|
||||
counter-claims, and cross-claims) alleging that a Contributor Version
|
||||
directly or indirectly infringes any patent, then the rights granted to
|
||||
You by any and all Contributors for the Covered Software under Section
|
||||
2.1 of this License shall terminate.
|
||||
|
||||
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
|
||||
end user license agreements (excluding distributors and resellers) which
|
||||
have been validly granted by You or Your distributors under this License
|
||||
prior to termination shall survive termination.
|
||||
|
||||
************************************************************************
|
||||
* *
|
||||
* 6. Disclaimer of Warranty *
|
||||
* ------------------------- *
|
||||
* *
|
||||
* Covered Software is provided under this License on an "as is" *
|
||||
* basis, without warranty of any kind, either expressed, implied, or *
|
||||
* statutory, including, without limitation, warranties that the *
|
||||
* Covered Software is free of defects, merchantable, fit for a *
|
||||
* particular purpose or non-infringing. The entire risk as to the *
|
||||
* quality and performance of the Covered Software is with You. *
|
||||
* Should any Covered Software prove defective in any respect, You *
|
||||
* (not any Contributor) assume the cost of any necessary servicing, *
|
||||
* repair, or correction. This disclaimer of warranty constitutes an *
|
||||
* essential part of this License. No use of any Covered Software is *
|
||||
* authorized under this License except under this disclaimer. *
|
||||
* *
|
||||
************************************************************************
|
||||
|
||||
************************************************************************
|
||||
* *
|
||||
* 7. Limitation of Liability *
|
||||
* -------------------------- *
|
||||
* *
|
||||
* Under no circumstances and under no legal theory, whether tort *
|
||||
* (including negligence), contract, or otherwise, shall any *
|
||||
* Contributor, or anyone who distributes Covered Software as *
|
||||
* permitted above, be liable to You for any direct, indirect, *
|
||||
* special, incidental, or consequential damages of any character *
|
||||
* including, without limitation, damages for lost profits, loss of *
|
||||
* goodwill, work stoppage, computer failure or malfunction, or any *
|
||||
* and all other commercial damages or losses, even if such party *
|
||||
* shall have been informed of the possibility of such damages. This *
|
||||
* limitation of liability shall not apply to liability for death or *
|
||||
* personal injury resulting from such party's negligence to the *
|
||||
* extent applicable law prohibits such limitation. Some *
|
||||
* jurisdictions do not allow the exclusion or limitation of *
|
||||
* incidental or consequential damages, so this exclusion and *
|
||||
* limitation may not apply to You. *
|
||||
* *
|
||||
************************************************************************
|
||||
|
||||
8. Litigation
|
||||
-------------
|
||||
|
||||
Any litigation relating to this License may be brought only in the
|
||||
courts of a jurisdiction where the defendant maintains its principal
|
||||
place of business and such litigation shall be governed by laws of that
|
||||
jurisdiction, without reference to its conflict-of-law provisions.
|
||||
Nothing in this Section shall prevent a party's ability to bring
|
||||
cross-claims or counter-claims.
|
||||
|
||||
9. Miscellaneous
|
||||
----------------
|
||||
|
||||
This License represents the complete agreement concerning the subject
|
||||
matter hereof. If any provision of this License is held to be
|
||||
unenforceable, such provision shall be reformed only to the extent
|
||||
necessary to make it enforceable. Any law or regulation which provides
|
||||
that the language of a contract shall be construed against the drafter
|
||||
shall not be used to construe this License against a Contributor.
|
||||
|
||||
10. Versions of the License
|
||||
---------------------------
|
||||
|
||||
10.1. New Versions
|
||||
|
||||
Mozilla Foundation is the license steward. Except as provided in Section
|
||||
10.3, no one other than the license steward has the right to modify or
|
||||
publish new versions of this License. Each version will be given a
|
||||
distinguishing version number.
|
||||
|
||||
10.2. Effect of New Versions
|
||||
|
||||
You may distribute the Covered Software under the terms of the version
|
||||
of the License under which You originally received the Covered Software,
|
||||
or under the terms of any subsequent version published by the license
|
||||
steward.
|
||||
|
||||
10.3. Modified Versions
|
||||
|
||||
If you create software not governed by this License, and you want to
|
||||
create a new license for such software, you may create and use a
|
||||
modified version of this License if you rename the license and remove
|
||||
any references to the name of the license steward (except to note that
|
||||
such modified license differs from this License).
|
||||
|
||||
10.4. Distributing Source Code Form that is Incompatible With Secondary
|
||||
Licenses
|
||||
|
||||
If You choose to distribute Source Code Form that is Incompatible With
|
||||
Secondary Licenses under the terms of this version of the License, the
|
||||
notice described in Exhibit B of this License must be attached.
|
||||
|
||||
Exhibit A - Source Code Form License Notice
|
||||
-------------------------------------------
|
||||
|
||||
This Source Code Form is subject to the terms of the Mozilla Public
|
||||
License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
If it is not possible or desirable to put the notice in a particular
|
||||
file, then You may include the notice in a location (such as a LICENSE
|
||||
file in a relevant directory) where a recipient would be likely to look
|
||||
for such a notice.
|
||||
|
||||
You may add additional accurate notices of copyright ownership.
|
||||
|
||||
Exhibit B - "Incompatible With Secondary Licenses" Notice
|
||||
---------------------------------------------------------
|
||||
|
||||
This Source Code Form is "Incompatible With Secondary Licenses", as
|
||||
defined by the Mozilla Public License, v. 2.0.
|
||||
268
README.md
Normal file
268
README.md
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
# csv-sanity
|
||||
|
||||
Preserve your sanity in a world full of malformed, poorly validated CSV files.
|
||||
Sanitize and transform large CSVs with millions of records quickly and
|
||||
efficiently.
|
||||
|
||||
**NOTE:** csv-sanity is in an alpha state and is subject to breaking changes.
|
||||
The ruleset file syntax in particular is likely to change in the near future.
|
||||
I've personally used csv-sanity on a number of projects and it has been
|
||||
incredibly helpful, but as with most alpha software csv-sanity is provided
|
||||
as-is and provides no warranty or guarantee. Use at your own risk and double
|
||||
check your transformed files!
|
||||
|
||||
## Purpose
|
||||
|
||||
The CSV format is not well-standardized and has many shortfalls when it comes to
|
||||
storing large numbers of records with complex data formats, but CSVs are
|
||||
ubiquitous in many realms as a neutral interchange format that most CRMs and
|
||||
database software can parse and understand.
|
||||
|
||||
But what happens when your CRM can only parse ISO 8601 formatted dates and the
|
||||
CSV you inherited has dates in another format such as the following:
|
||||
|
||||
```csv
|
||||
id,name,signup_date
|
||||
2,John Doe,11/22/2017
|
||||
3,Jane Doe,11/28/2017
|
||||
```
|
||||
|
||||
Or you received a CSV of people who you need to contact via a personalized
|
||||
email, but your contacts' names in the CSV are in ALL CAPS:
|
||||
|
||||
```csv
|
||||
id,first_name,last_name
|
||||
2,JOHN,DOE
|
||||
3,JANE,DOE
|
||||
```
|
||||
|
||||
Or you have a CSV that has valid values for the vast majority of records, but 1
|
||||
out of every 20k records has nonsense values that cause your entire import to
|
||||
abort:
|
||||
|
||||
```csv
|
||||
id,first_name,last_name,party_registration
|
||||
2,Jane,Doe,REP
|
||||
3,John,Doe,DEM
|
||||
345,Josh,Smith,HAHAHAHA
|
||||
```
|
||||
|
||||
Or even a CSV that has a few malformed records due to unescaped commas:
|
||||
|
||||
```csv
|
||||
id,first_name,last_name,email
|
||||
2,Jane,Doe,jane@example.com
|
||||
3,John,Doe,"i,don't,follow,the,rules"@example.com
|
||||
```
|
||||
|
||||
These are all real problems I've encountered with CSVs over the years. If the
|
||||
CSV is small enough they can be corrected by hand, but for CSVs with 10k, 100k
|
||||
or even millions of records correcting by hand simply isn't a viable option.
|
||||
|
||||
`csv-sanity` aims to solve the issue of sanitizing large, poorly-validated CSVs.
|
||||
|
||||
## Usage
|
||||
|
||||
`csv-sanity` is an executable that takes an input CSV to process and a JSON
|
||||
ruleset file defining the transformation rules to apply:
|
||||
|
||||
```bash
|
||||
csv-sanity [-r RULESET_FILE] <INPUT_FILE>
|
||||
```
|
||||
|
||||
If a path to a ruleset file is not provided via the `-r` option, `csv-sanity`
|
||||
will look for a file named "ruleset.json" in the current directory.
|
||||
|
||||
By default, `csv-sanity` outputs two files to the current directory:
|
||||
output.csv, which contains the processed CSV with validated and transformed
|
||||
records, and errors.csv, which contains a list of records and fields that
|
||||
couldn't be processed and reasons they were rejected. The paths where the output
|
||||
and error files are output can be overridden via the `-o FILE_PATH` and
|
||||
`-e FILE_PATH` options, respectively.
|
||||
|
||||
## ruleset.json Syntax
|
||||
|
||||
Ruleset files are JSON files that define a collection of transformation rules
|
||||
and the fields to which they should be applied.
|
||||
|
||||
The following is an example ruleset JSON file:
|
||||
|
||||
```json
|
||||
{
|
||||
"rules": [
|
||||
{
|
||||
"applicability": {
|
||||
"Global": []
|
||||
},
|
||||
"transformer": {
|
||||
"None": {
|
||||
"regex": "\\A(?:[[:cntrl:]]|\\s)*\\z"
|
||||
}
|
||||
},
|
||||
"priority": -10
|
||||
},
|
||||
{
|
||||
"applicability": {
|
||||
"Global": []
|
||||
},
|
||||
"transformer": {
|
||||
"Trim": {}
|
||||
},
|
||||
"priority": -10
|
||||
},
|
||||
{
|
||||
"applicability": {
|
||||
"Fields": {
|
||||
"field_names": [
|
||||
"first_name",
|
||||
"last_name"
|
||||
]
|
||||
}
|
||||
},
|
||||
"transformer": {
|
||||
"Capitalize": {}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Every ruleset.json file is a JSON object with a single "rules" field with an
|
||||
array of rule objects.
|
||||
|
||||
Rules are objects with two fields:
|
||||
|
||||
- **"applicability"**: specifies whether a rule applies globally or only to a
|
||||
predefined set of fields (specified as the column headers in the CSV being
|
||||
processed)
|
||||
- **"transformer"**: a transformer object, which specifies how the applicable
|
||||
fields should be transformed.
|
||||
|
||||
### Transformers
|
||||
|
||||
|
||||
|
||||
#### Capitalize
|
||||
|
||||
Transforms string fields into Capital Case.
|
||||
|
||||
#### Choice
|
||||
|
||||
Only accepts a pre-defined list of acceptable values and rejects the rest.
|
||||
|
||||
#### Date
|
||||
|
||||
```json
|
||||
{
|
||||
"Date": {
|
||||
"input_formats": [
|
||||
"%m/%d/%Y"
|
||||
],
|
||||
"output_formats": "%F"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Attempt to parse fields with a list of datetime formats via
|
||||
[time::strptime](https://docs.rs/time/0.1.37/time/fn.strptime.html). See the
|
||||
docs for the [time](https://docs.rs/time/0.1.37/time/index.html) crate for
|
||||
details on datetime formatting syntax.
|
||||
|
||||
#### Email
|
||||
|
||||
```json
|
||||
{
|
||||
"Email": {}
|
||||
}
|
||||
```
|
||||
|
||||
Attempt to parse fields as email addresses, rejecting any fields that appear to
|
||||
be invalid email addresses.
|
||||
|
||||
#### None
|
||||
|
||||
```json
|
||||
{
|
||||
"None": {
|
||||
"regex": "\\A(?:[[:cntrl:]]|\\s)*\\z"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Replace matched fields with a blank value. Useful as a global rule for
|
||||
normalizing blank fields in a CSV file.
|
||||
|
||||
#### Number
|
||||
|
||||
```json
|
||||
{
|
||||
"Number": {}
|
||||
}
|
||||
```
|
||||
|
||||
Attempt to parse fields as whole integers, rejecting any fields that cannot be
|
||||
parsed.
|
||||
|
||||
#### PhoneNumber
|
||||
|
||||
```json
|
||||
{
|
||||
"PhoneNumber": {}
|
||||
}
|
||||
```
|
||||
|
||||
Attempt to parse fields as US, NANP-formatted phone numbers, transforming them
|
||||
into a standard international format of `+1 <area_code> <exchange_code> <subscriber_number>`.
|
||||
|
||||
#### Regex
|
||||
|
||||
```json
|
||||
{
|
||||
"Regex": {
|
||||
"regex": "\\A([A-Z])[A-Z]+\\z",
|
||||
"template": "$1"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Match fields against the provided regex pattern and transform them according to
|
||||
the template string, replacing capture group placeholders. See
|
||||
[Regex::replace](https://docs.rs/regex/0.2.1/regex/struct.Regex.html#method.replace)
|
||||
in the regex crate docs for details.
|
||||
|
||||
#### RegexMatch
|
||||
|
||||
```json
|
||||
{
|
||||
"RegexMatch": {
|
||||
"regex": "\\A[A-Z]{2,3}\\z",
|
||||
"negate": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Reject any fields that fail to match against the provided regex pattern. If
|
||||
`negate` is `true`, then reject any fields that match the provided regex pattern
|
||||
instead.
|
||||
|
||||
#### Trim
|
||||
|
||||
```json
|
||||
{
|
||||
"Trim": {}
|
||||
}
|
||||
```
|
||||
|
||||
Trim leading and trailing whitespace from fields. Useful as a global rule to
|
||||
normalize fields and remove useless whitespace.
|
||||
|
||||
#### Zipcode
|
||||
|
||||
```json
|
||||
{
|
||||
"Zipcode": {}
|
||||
}
|
||||
```
|
||||
|
||||
Attempt to parse fields as US zip codes in the formats "xxxxx" and "xxxxx-xxxx",
|
||||
rejecting any fields that fail to match that format.
|
||||
193
src/cli.rs
Normal file
193
src/cli.rs
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
//! Command line interface.
|
||||
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
|
||||
use {
|
||||
Ruleset,
|
||||
TransformError,
|
||||
TransformedRecord,
|
||||
};
|
||||
|
||||
use csv;
|
||||
|
||||
/// Configuration options for the `Cli`.
|
||||
pub struct Options
|
||||
{
|
||||
/// See `CsvOptions`.
|
||||
pub csv_options: CsvOptions,
|
||||
}
|
||||
|
||||
impl Default for Options {
|
||||
fn default() -> Options {
|
||||
Options {
|
||||
csv_options: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// `Cli` configuration options specific to how to parse the CSV file.
|
||||
///
|
||||
/// `CsvOptions` implements `Default` with the following defaults:
|
||||
///
|
||||
/// ```
|
||||
/// extern crate csv;
|
||||
/// use csv_sanity::cli::CsvOptions;
|
||||
/// use csv::RecordTerminator;
|
||||
///
|
||||
/// let defaults = CsvOptions {
|
||||
/// delimiter: b',',
|
||||
/// record_terminator: csv::RecordTerminator::CRLF,
|
||||
/// quote: b'"',
|
||||
/// escape: None,
|
||||
/// double_quote: true,
|
||||
/// };
|
||||
/// assert_eq!(defaults, Default::default());
|
||||
/// ```
|
||||
pub struct CsvOptions
|
||||
{
|
||||
/// Field delimeter to expect in the CSV file.
|
||||
///
|
||||
/// Corresponds to the `csv::Reader.delimiter` method.
|
||||
pub delimiter: u8,
|
||||
/// Record terminator to expect in the CSV file.
|
||||
///
|
||||
/// Corresponds to the `csv::Reader.record_terminator` method. See `csv::RecordTerminator`.
|
||||
pub record_terminator: csv::RecordTerminator,
|
||||
/// Field quotation character to expect in the CSV file.
|
||||
///
|
||||
/// Corresponds to the `csv::Reader.quote` method.
|
||||
pub quote: u8,
|
||||
/// Escape character to expect in the CSV file.
|
||||
///
|
||||
/// Corresponds to the `csv::Reader.escape` method.
|
||||
pub escape: Option<u8>,
|
||||
/// Whether two adjacent quote characters should be interpreted as an escaped quote character.
|
||||
///
|
||||
/// Corresponds to the `csv::Reader.double_quote` method.
|
||||
pub double_quote: bool
|
||||
}
|
||||
|
||||
impl Default for CsvOptions
|
||||
{
|
||||
fn default() -> CsvOptions {
|
||||
CsvOptions {
|
||||
delimiter: b',',
|
||||
record_terminator: csv::RecordTerminator::CRLF,
|
||||
quote: b'"',
|
||||
escape: None,
|
||||
double_quote: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Command line interface for running a `Ruleset` against a CSV file.
|
||||
pub struct Cli
|
||||
{
|
||||
options: Options,
|
||||
ruleset: Ruleset,
|
||||
}
|
||||
|
||||
impl Cli
|
||||
{
|
||||
/// Construct a new `Cli` with default options.
|
||||
///
|
||||
/// ```
|
||||
/// use csv_sanity::Ruleset;
|
||||
/// use csv_sanity::cli::{
|
||||
/// Cli
|
||||
/// };
|
||||
///
|
||||
/// let ruleset = Ruleset::new();
|
||||
/// let cli = Cli::new(ruleset);
|
||||
/// ```
|
||||
pub fn new(ruleset: Ruleset) -> Cli {
|
||||
Self::new_with_options(ruleset, Default::default())
|
||||
}
|
||||
|
||||
/// Construct a new `Cli` with the specified options.
|
||||
///
|
||||
/// ```
|
||||
/// use csv_sanity::Ruleset;
|
||||
/// use csv_sanity::cli::{
|
||||
/// Cli,
|
||||
/// Options,
|
||||
/// CsvOptions
|
||||
/// };
|
||||
///
|
||||
/// let ruleset = Ruleset::new();
|
||||
/// let cli = Cli::new_with_options(ruleset, Options {
|
||||
/// csv_options: CsvOptions {
|
||||
/// delimiter: b',',
|
||||
/// .. Default::default()
|
||||
/// },
|
||||
/// .. Default::default()
|
||||
/// });
|
||||
/// ```
|
||||
pub fn new_with_options(ruleset: Ruleset, options: Options) -> Cli {
|
||||
Cli {
|
||||
options: options,
|
||||
ruleset: ruleset,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn run<I: AsRef<Path>, O: AsRef<Path>, E: AsRef<Path>>(&self, input_file_path: I, output_file_name: O, error_file_name: E) {
|
||||
let (mut reader, headers) = self.reader_from_file(input_file_path);
|
||||
|
||||
let mut output_writer = csv::Writer::from_file(output_file_name).expect("Unable to open output file for writing");
|
||||
let mut output_headers = headers.clone();
|
||||
output_headers.insert(0, "Record Number".to_string());
|
||||
output_writer.encode(output_headers).expect("Unable to write to output file");
|
||||
|
||||
let mut error_writer = csv::Writer::from_file(error_file_name).expect("Unable to open error file for writing");
|
||||
let error_headers = vec![
|
||||
"Record Number",
|
||||
"Field Name",
|
||||
"Field Value",
|
||||
"Reason",
|
||||
];
|
||||
error_writer.encode(error_headers).expect("Unable to write to error file");
|
||||
|
||||
for (record_n, record) in reader.records().enumerate() {
|
||||
let original_line_n = record_n + 2; // Plus one for headers and plus one for zero-indexing.
|
||||
let transformed_record: TransformedRecord = match record {
|
||||
Err(e) => {
|
||||
let err = TransformError {
|
||||
field_value: "".to_string(),
|
||||
field_name: "".to_string(),
|
||||
record_n: original_line_n,
|
||||
reason: format!("{}", e),
|
||||
};
|
||||
error_writer.encode(err).expect("Unable to write to error file");
|
||||
continue;
|
||||
},
|
||||
Ok(ref rec) => self.ruleset.apply_rules(&headers, rec, original_line_n)
|
||||
};
|
||||
let record_fields: Vec<Option<String>> = {
|
||||
let mut fs = vec![Some(original_line_n.to_string())];
|
||||
fs.extend(transformed_record.field_values);
|
||||
fs
|
||||
};
|
||||
output_writer.encode(record_fields).expect("Unable to write to output file");
|
||||
for error in transformed_record.errors {
|
||||
error_writer.encode(error).expect("Unable to write to error file");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Open `path` as a CSV reader configured from this `Cli`'s options and read its header row.
///
/// The reader always expects a header row and is flexible about record lengths.
///
/// # Panics
///
/// Panics if the file cannot be opened or its headers cannot be read.
fn reader_from_file<P: AsRef<Path>>(&self, path: P) -> (csv::Reader<File>, Vec<String>) {
    // Borrow once; `&Path` is all that both the reader and the error messages need.
    let path = path.as_ref();
    let csv_options = &self.options.csv_options;

    let mut reader = csv::Reader::from_file(path)
        .map(|r| {
            // Configure the reader according to the options passed to the Cli constructor.
            r.has_headers(true)
                .delimiter(csv_options.delimiter)
                .record_terminator(csv_options.record_terminator)
                .quote(csv_options.quote)
                .escape(csv_options.escape)
                .double_quote(csv_options.double_quote)
                .flexible(true)
        })
        // The message is only formatted on failure; the text matches what `expect` printed.
        .unwrap_or_else(|e| panic!("Unable to read file {}: {:?}", path.display(), e));

    let headers = reader.headers()
        .unwrap_or_else(|e| panic!("Unable to read headers from input file {}: {:?}", path.display(), e));

    (reader, headers)
}
|
||||
}
|
||||
36
src/lib.rs
Normal file
36
src/lib.rs
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
#[macro_use]
|
||||
extern crate serde_derive;
|
||||
extern crate serde;
|
||||
extern crate serde_json;
|
||||
extern crate regex;
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
extern crate unicode_segmentation;
|
||||
extern crate time;
|
||||
extern crate csv;
|
||||
#[macro_use]
|
||||
extern crate custom_derive;
|
||||
#[macro_use]
|
||||
extern crate newtype_derive;
|
||||
extern crate rustc_serialize;
|
||||
|
||||
mod newtypes;
|
||||
|
||||
pub mod transformer;
|
||||
pub use transformer::{
|
||||
Transformer,
|
||||
TransformResult,
|
||||
TransformResultHelper,
|
||||
TransformError
|
||||
};
|
||||
|
||||
pub mod transformers;
|
||||
|
||||
mod ruleset;
|
||||
pub use ruleset::{
|
||||
Rule,
|
||||
Ruleset,
|
||||
TransformedRecord,
|
||||
};
|
||||
|
||||
pub mod cli;
|
||||
111
src/main.rs
Normal file
111
src/main.rs
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
extern crate csv_sanity;
|
||||
|
||||
extern crate serde_json;
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
extern crate regex;
|
||||
#[macro_use]
|
||||
extern crate clap;
|
||||
|
||||
use csv_sanity::cli::{
|
||||
self,
|
||||
Cli,
|
||||
};
|
||||
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use log::{
|
||||
LogRecord,
|
||||
LogLevel,
|
||||
LogMetadata,
|
||||
LogLevelFilter,
|
||||
SetLoggerError
|
||||
};
|
||||
use clap::{
|
||||
App,
|
||||
Arg
|
||||
};
|
||||
|
||||
struct ConsoleLogger {
|
||||
log_level: LogLevel
|
||||
}
|
||||
|
||||
impl log::Log for ConsoleLogger {
|
||||
fn enabled(&self, metadata: &LogMetadata) -> bool {
|
||||
metadata.level() <= self.log_level
|
||||
}
|
||||
|
||||
fn log(&self, record: &LogRecord) {
|
||||
if self.enabled(record.metadata()) {
|
||||
println!("{} - {}", record.level(), record.args())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn init_logging() -> Result<(), SetLoggerError> {
|
||||
log::set_logger(|max_log_level| {
|
||||
max_log_level.set(LogLevelFilter::Info);
|
||||
Box::new(ConsoleLogger { log_level: LogLevel::Info })
|
||||
})
|
||||
}
|
||||
|
||||
fn main() {
|
||||
init_logging().unwrap();
|
||||
|
||||
let matches = App::new("Convert CSV")
|
||||
.version(crate_version!())
|
||||
.author("M. George Hansen <technopolitica@gmail.com>")
|
||||
.about("Apply a set of transformations to the records in a CSV file, attempting to read a much valid information from the file as possible.")
|
||||
.arg(Arg::with_name("INPUT_FILE")
|
||||
.help("CSV file to process")
|
||||
.required(true)
|
||||
.index(1))
|
||||
.arg(Arg::with_name("output")
|
||||
.help("File to output the transformed CSV records. Defaults to ./output.csv")
|
||||
.short("o")
|
||||
.long("output")
|
||||
.takes_value(true))
|
||||
.arg(Arg::with_name("error_output")
|
||||
.help("File to output errors in CSV format. Defaults to ./errors.csv")
|
||||
.short("e")
|
||||
.long("error_output")
|
||||
.takes_value(true))
|
||||
.arg(Arg::with_name("ruleset")
|
||||
.help("JSON file containing the ruleset to apply. Defaults to ./ruleset.json")
|
||||
.short("r")
|
||||
.long("ruleset")
|
||||
.takes_value(true))
|
||||
.get_matches();
|
||||
|
||||
let ruleset_file_path = Path::new(matches.value_of("ruleset").unwrap_or("ruleset.json"));
|
||||
let ruleset_file = match File::open(ruleset_file_path) {
|
||||
Ok(f) => f,
|
||||
Err(e) => exit_with_error(&format!("unable to read ruleset file {}: {}", ruleset_file_path.display(), e))
|
||||
};
|
||||
let ruleset = match serde_json::from_reader(ruleset_file) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
exit_with_error(&format!("failed to parse ruleset from {}: {}", ruleset_file_path.display(), e));
|
||||
}
|
||||
};
|
||||
|
||||
let cli_app = Cli::new_with_options(ruleset, cli::Options {
|
||||
csv_options: cli::CsvOptions {
|
||||
delimiter: b'\t',
|
||||
.. Default::default()
|
||||
},
|
||||
.. Default::default()
|
||||
});
|
||||
|
||||
// NOTE: Required arguments are validated by clap, so we should be safe to use expect here.
|
||||
let input_file_name = matches.value_of("INPUT_FILE").expect("INPUT_FILE argument could not be found!");
|
||||
let output_file_name = matches.value_of("output_file").unwrap_or("output.csv");
|
||||
let error_file_name = matches.value_of("error_file").unwrap_or("errors.csv");
|
||||
cli_app.run(input_file_name, output_file_name, error_file_name);
|
||||
}
|
||||
|
||||
fn exit_with_error(error_msg: &str) -> !
|
||||
{
|
||||
error!("{}", error_msg);
|
||||
std::process::exit(1);
|
||||
}
|
||||
59
src/newtypes.rs
Normal file
59
src/newtypes.rs
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
use std::hash::{
|
||||
Hash,
|
||||
Hasher,
|
||||
};
|
||||
use regex;
|
||||
use serde::{
|
||||
Serialize,
|
||||
Serializer,
|
||||
Deserialize,
|
||||
Deserializer,
|
||||
};
|
||||
|
||||
custom_derive! {
|
||||
#[derive(NewtypeFrom, NewtypeDeref, NewtypeDerefMut, Clone, NewtypeDisplay, NewtypeDebug)]
|
||||
pub struct Regex(regex::Regex);
|
||||
}
|
||||
|
||||
impl PartialEq for Regex {
|
||||
fn eq(&self, other: &Regex) -> bool
|
||||
{
|
||||
self.0.as_str() == other.0.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Regex {}
|
||||
|
||||
impl Hash for Regex {
|
||||
fn hash<H>(&self, state: &mut H)
|
||||
where H: Hasher {
|
||||
self.as_str().hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Regex
|
||||
{
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer {
|
||||
let Regex(ref regex) = *self;
|
||||
regex.as_str().serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Regex
|
||||
{
|
||||
fn deserialize<D>(deserializer: D) -> Result<Regex, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
{
|
||||
use serde::de::{Unexpected, Error};
|
||||
let string: Result<String, D::Error> = Deserialize::deserialize(deserializer);
|
||||
string.and_then(|s| {
|
||||
regex::Regex::new(&s)
|
||||
.map(|r| Regex(r))
|
||||
.map_err(|e| {
|
||||
let message: &str = &format!("invalid regex string: {}", e);
|
||||
D::Error::invalid_value(Unexpected::Str(&s), &message)
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
345
src/ruleset.rs
Normal file
345
src/ruleset.rs
Normal file
|
|
@ -0,0 +1,345 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResult,
|
||||
TransformError,
|
||||
};
|
||||
use transformers::{
|
||||
Transformers,
|
||||
TrimTransformer,
|
||||
NoneTransformer,
|
||||
};
|
||||
|
||||
use std::hash::{
|
||||
Hash,
|
||||
Hasher,
|
||||
};
|
||||
use std::iter::FromIterator;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{
|
||||
BinaryHeap,
|
||||
HashSet,
|
||||
};
|
||||
use std::error;
|
||||
use std::fmt::{
|
||||
self,
|
||||
Formatter,
|
||||
Display,
|
||||
};
|
||||
|
||||
/// Applicability of a `Rule` determining which CSV record's fields it can be applied to.
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub enum Applicability {
|
||||
/// Applicable to all CSV record fields.
|
||||
Global,
|
||||
/// Applicable to a subset of a CSV record's fields, specified by field name.
|
||||
Fields {
|
||||
field_names: HashSet<String>
|
||||
}
|
||||
}
|
||||
|
||||
impl Hash for Applicability {
|
||||
fn hash<H>(&self, state: &mut H)
|
||||
where H: Hasher {
|
||||
use self::Applicability::*;
|
||||
match *self {
|
||||
Global => (self as *const Applicability).hash(state), // FIXME: Is this the correct way to hash an empty enum variant?
|
||||
Fields { ref field_names } => field_names.iter().collect::<Vec<&String>>().hash(state)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when `priority` is the default priority of 0.
///
/// Takes a reference because it is used as a serde `skip_serializing_if` predicate.
fn priority_is_default(priority: &isize) -> bool {
    *priority == 0
}
|
||||
|
||||
/// A `Transformer` paired with `Applicability` and a priority which can be applied to fields in a
|
||||
/// CSV record.
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct Rule
|
||||
{
|
||||
applicability: Applicability,
|
||||
transformer: Transformers,
|
||||
#[serde(default, skip_serializing_if="priority_is_default")]
|
||||
priority: isize
|
||||
}
|
||||
|
||||
impl Rule
|
||||
{
|
||||
/// Construct a new `Rule` whoe `Transformer` is applicable to one or more CSV record's fields
|
||||
/// referenced by name with the default priority of 0.
|
||||
///
|
||||
/// # Examples
|
||||
/// ```
|
||||
/// use csv_sanity::Rule;
|
||||
/// use csv_sanity::transformers::*;
|
||||
///
|
||||
/// let rule = Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
|
||||
/// CapitalizeTransformer::new()
|
||||
/// ));
|
||||
/// ```
|
||||
pub fn for_fields(field_names: &[&str], transformer: Transformers) -> Rule {
|
||||
Self::for_fields_with_priority(field_names, transformer, Default::default())
|
||||
}
|
||||
|
||||
/// Construct a new `Rule` whoe `Transformer` is applicable to one or more CSV record's fields
|
||||
/// referenced by name with the specified priority.
|
||||
///
|
||||
/// # Examples
|
||||
/// ```
|
||||
/// use csv_sanity::Rule;
|
||||
/// use csv_sanity::transformers::*;
|
||||
///
|
||||
/// let rule = Rule::for_fields_with_priority(&["Fist Name", "Last Name"], Transformers::Capitalize(
|
||||
/// CapitalizeTransformer::new()
|
||||
/// ), 10);
|
||||
/// ```
|
||||
pub fn for_fields_with_priority(field_names: &[&str], transformer: Transformers, priority: isize) -> Rule {
|
||||
Rule {
|
||||
applicability: Applicability::Fields { field_names: field_names.iter().map(|s| s.to_string()).collect() },
|
||||
transformer: transformer,
|
||||
priority: priority
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct a new `Rule` applicable to all of a CSV record's fields with the default priority
|
||||
/// of 0.
|
||||
///
|
||||
/// # Examples
|
||||
/// ```
|
||||
/// use csv_sanity::Rule;
|
||||
/// use csv_sanity::transformers::*;
|
||||
///
|
||||
/// let rule = Rule::global(Transformers::Capitalize(
|
||||
/// CapitalizeTransformer::new()
|
||||
/// ));
|
||||
/// ```
|
||||
pub fn global(transformer: Transformers) -> Rule {
|
||||
Self::global_with_priority(transformer, Default::default())
|
||||
}
|
||||
|
||||
/// Construct a new `Rule` applicable to all of a CSV record's fields with the specified
|
||||
/// priority.
|
||||
///
|
||||
/// # Examples
|
||||
/// ```
|
||||
/// use csv_sanity::Rule;
|
||||
/// use csv_sanity::transformers::*;
|
||||
///
|
||||
/// let rule = Rule::global_with_priority(Transformers::Capitalize(
|
||||
/// CapitalizeTransformer::new()
|
||||
/// ), 10);
|
||||
/// ```
|
||||
pub fn global_with_priority(transformer: Transformers, priority: isize) -> Rule {
|
||||
Rule {
|
||||
applicability: Applicability::Global,
|
||||
transformer: transformer,
|
||||
priority: priority
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply this rule to a CSV record's field, returning the resulting `TransformResult`.
|
||||
///
|
||||
/// # Examples
|
||||
/// ```
|
||||
/// use csv_sanity::Rule;
|
||||
/// use csv_sanity::transformers::*;
|
||||
///
|
||||
/// let field = "JOHN";
|
||||
/// let field_name = "First Name";
|
||||
///
|
||||
/// let rule = Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
|
||||
/// CapitalizeTransformer::new()
|
||||
/// ));
|
||||
/// rule.apply(field, field_name, 1);
|
||||
/// ```
|
||||
pub fn apply(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
// XXX: Does the applicability check belong inside the apply method? Or should the caller
|
||||
// decide?
|
||||
match self.applicability {
|
||||
Applicability::Global => self.transformer.transform(field_value, field_name, record_n),
|
||||
Applicability::Fields { ref field_names } if field_names.contains(&field_name.to_string()) => {
|
||||
self.transformer.transform(field_value, field_name, record_n)
|
||||
},
|
||||
_ => Ok(Some(field_value.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Rule
|
||||
{
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.priority.cmp(&self.priority)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Rule
|
||||
{
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// An ordered set of `Rule`s sorted by priority.
|
||||
///
|
||||
/// # Examples
|
||||
/// ```
|
||||
/// use csv_sanity::{
|
||||
/// Ruleset,
|
||||
/// Rule,
|
||||
/// TransformedRecord,
|
||||
/// };
|
||||
/// use csv_sanity::transformers::*;
|
||||
/// let ruleset = {
|
||||
/// let mut r = Ruleset::new();
|
||||
/// r.add_rule(Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
|
||||
/// CapitalizeTransformer::new()
|
||||
/// )));
|
||||
/// r.add_rule(Rule::for_fields(&["Email"], Transformers::Email(
|
||||
/// EmailTransformer::new()
|
||||
/// )));
|
||||
/// r
|
||||
/// };
|
||||
/// let headers = vec!["Id", "First Name", "Last Name", "Email"].iter().map(|s| s.to_string()).collect();
|
||||
/// let record = vec!["1", " JOHN", "SNOW ", "\t JSNOW@EXAMPLE.COM "].iter().map(|s| s.to_string()).collect();
|
||||
/// let transformed_record = ruleset.apply_rules(&headers, &record, 1);
|
||||
/// assert_eq!(TransformedRecord {
|
||||
/// field_values: vec!["1", "John", "Snow", "jsnow@example.com"].iter().map(|s| Some(s.to_string())).collect(),
|
||||
/// errors: Vec::new(),
|
||||
/// }, transformed_record);
|
||||
/// ```
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct Ruleset {
|
||||
rules: BinaryHeap<Rule>
|
||||
}
|
||||
|
||||
impl Ruleset {
|
||||
/// Construct a new `Ruleset` with a default `NoneTransformer` and `TrimTransformer` global
|
||||
/// rules.
|
||||
///
|
||||
/// The default trim and none rules should be appropriate for most CSV files. For CSV files
|
||||
/// where these default rules are not desired use the `Ruleset::without_default_rules` method.
|
||||
pub fn new() -> Ruleset {
|
||||
let mut ruleset = Self::without_default_rules();
|
||||
// Add a default trim rule and blank rule to match empty fields.
|
||||
ruleset.add_rule(Rule::global_with_priority(Transformers::None(NoneTransformer::with_blank_matcher()), -10));
|
||||
ruleset.add_rule(Rule::global_with_priority(Transformers::Trim(TrimTransformer::new()), -10));
|
||||
ruleset
|
||||
}
|
||||
|
||||
/// Construct a new `Ruleset` without any of the default rules.
|
||||
pub fn without_default_rules() -> Ruleset {
|
||||
Ruleset {
|
||||
rules: BinaryHeap::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a `Rule` to the this ruleset.
|
||||
pub fn add_rule(&mut self, rule: Rule) {
|
||||
self.rules.push(rule);
|
||||
}
|
||||
|
||||
/// Validate this ruleset against a CSV file by comparing it's `Rule`s against the headers.
|
||||
pub fn validate_rules(&self, headers: &Vec<String>) -> Result<(), Vec<ValidationError>> {
|
||||
let mut errors = Vec::new();
|
||||
for rule in self.rules.iter() {
|
||||
if let Applicability::Fields { ref field_names } = rule.applicability {
|
||||
let header_set = HashSet::<String>::from_iter(headers.clone());
|
||||
let field_set = HashSet::<String>::from_iter(field_names.clone());
|
||||
let diff: HashSet<String> = field_set.difference(&header_set).cloned().collect();
|
||||
if diff.len() > 0 {
|
||||
// FIXME: We should have a better way to construct a ruleset that uses Result
|
||||
// instead of panic! here.
|
||||
errors.push(
|
||||
ValidationError {
|
||||
reason: format!("The following fields were not found in headers: '{:?}'", diff),
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
if errors.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(errors)
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply this `Ruleset` to a record from a CSV file.
|
||||
pub fn apply_rules(&self, headers: &Vec<String>, fields: &Vec<String>, record_n: usize) -> TransformedRecord {
|
||||
let expected_n_fields = headers.len();
|
||||
|
||||
let mut errors: Vec<TransformError> = Vec::new();
|
||||
let mut transformed_fields: Vec<Option<String>> = Vec::new();
|
||||
for (field_n, field_value) in fields.iter().enumerate() {
|
||||
if field_n < expected_n_fields {
|
||||
let field_name = &headers[field_n];
|
||||
let mut transformed_field_value = Some(field_value.clone());
|
||||
// Try each rule in order of priority and test to see if it is applicable.
|
||||
for rule in self.rules.iter() {
|
||||
let new_value = match transformed_field_value {
|
||||
Some(ref fv) => {
|
||||
let transform_result = rule.apply(fv, &field_name, record_n);
|
||||
match transform_result {
|
||||
Ok(tfv) => tfv,
|
||||
Err(e) => {
|
||||
errors.push(e);
|
||||
None
|
||||
}
|
||||
}
|
||||
},
|
||||
// The last transformer returned None, so we can short circuit and just
|
||||
// return None for the field value.
|
||||
None => break
|
||||
};
|
||||
transformed_field_value = new_value;
|
||||
}
|
||||
transformed_fields.insert(field_n, transformed_field_value);
|
||||
} else {
|
||||
errors.push(
|
||||
TransformError {
|
||||
field_value: field_value.to_string(),
|
||||
field_name: field_n.to_string(),
|
||||
record_n: record_n,
|
||||
reason: format!("found {} header fields but record had extra field at position {}", expected_n_fields, field_n)
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
TransformedRecord {
|
||||
field_values: transformed_fields,
|
||||
errors: errors,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error for when a `Ruleset` does not validate against a CSV file.
|
||||
#[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Hash, Debug)]
|
||||
pub struct ValidationError {
|
||||
reason: String,
|
||||
}
|
||||
|
||||
impl Display for ValidationError
|
||||
{
|
||||
fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
|
||||
write!(formatter, "{}", self.reason)
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for ValidationError
|
||||
{
|
||||
fn description(&self) -> &str {
|
||||
&self.reason
|
||||
}
|
||||
}
|
||||
|
||||
/// A single processed and transformed record.
|
||||
#[derive(Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
|
||||
pub struct TransformedRecord {
|
||||
/// Transformed fields for the record.
|
||||
///
|
||||
/// Empty field are explicitly encoded as `None` values.
|
||||
pub field_values: Vec<Option<String>>,
|
||||
/// Errors that were encountered during transformation, if any.
|
||||
pub errors: Vec<TransformError>,
|
||||
}
|
||||
91
src/transformer.rs
Normal file
91
src/transformer.rs
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
//! Traits and types that define transformations on CSV record fields.
|
||||
|
||||
use std::result;
|
||||
use std::error;
|
||||
use std::fmt::{
|
||||
self,
|
||||
Formatter,
|
||||
Display,
|
||||
};
|
||||
|
||||
/// `Result` for the transformation of a CSV record's field, either an `Option<String>` if
|
||||
/// successfully transformed or a `TransformError` if unsuccessful.
|
||||
pub type TransformResult = result::Result<Option<String>, TransformError>;
|
||||
|
||||
/// Helper trait with a few useful utility methods for constructing `TransformResult`.
|
||||
pub trait TransformResultHelper
|
||||
{
|
||||
/// Construct a `TransformResult` that represents a successful transformation of a CSV record's
|
||||
/// field with a non-empty value.
|
||||
fn present(value: &str) -> TransformResult {
|
||||
Ok(Some(value.to_string()))
|
||||
}
|
||||
|
||||
/// Construct a `TransformResult` that represents a successful tranformation of a CSV record's
|
||||
/// field with an empty value.
|
||||
fn excluded() -> TransformResult {
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Construct a `TransformResult` that represents a failed transformation of a CSV record's
|
||||
/// field with a descritive error reason.
|
||||
///
|
||||
/// An error reason should be a short, single sentence without punctuation or capitization,
|
||||
/// e.g. "not a valid email address" instead of "The email address was invalid.".
|
||||
///
|
||||
/// ```
|
||||
/// use csv_sanity::transformer::{
|
||||
/// TransformResult,
|
||||
/// TransformError,
|
||||
/// TransformResultHelper,
|
||||
/// };
|
||||
///
|
||||
/// let result = TransformResult::error("jak,.@hot mail.com", "Email", 0, "not a valid email address");
|
||||
/// assert_eq!(result, Err(TransformError {
|
||||
/// field_value: "jak,.@hot mail.com".to_string(),
|
||||
/// field_name: "Email".to_string(),
|
||||
/// record_n: 0,
|
||||
/// reason: "not a valid email address".to_string(),
|
||||
/// }));
|
||||
/// ```
|
||||
fn error(field_value: &str, field_name: &str, record_n: usize, reason: &str) -> TransformResult {
|
||||
Err(
|
||||
TransformError {
|
||||
field_value: field_value.to_string(),
|
||||
field_name: field_name.to_string(),
|
||||
record_n: record_n,
|
||||
reason: reason.to_string(),
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl TransformResultHelper for TransformResult {}
|
||||
|
||||
pub trait Transformer
|
||||
{
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult;
|
||||
}
|
||||
|
||||
#[derive(RustcEncodable, Deserialize, Serialize, Clone, PartialEq, Eq, Hash, Debug)]
|
||||
pub struct TransformError
|
||||
{
|
||||
pub record_n: usize,
|
||||
pub field_name: String,
|
||||
pub field_value: String,
|
||||
pub reason: String,
|
||||
}
|
||||
|
||||
impl Display for TransformError
|
||||
{
|
||||
fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
|
||||
write!(formatter, "failed to transform field: {}", self.reason)
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for TransformError
|
||||
{
|
||||
fn description(&self) -> &str {
|
||||
&self.reason
|
||||
}
|
||||
}
|
||||
41
src/transformers/capitalize.rs
Normal file
41
src/transformers/capitalize.rs
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
pub fn capitalize(string: &str) -> String
|
||||
{
|
||||
string.unicode_words()
|
||||
.map(capitalize_word).collect::<Vec<String>>()
|
||||
.join(" ")
|
||||
}
|
||||
|
||||
/// Upper-case the first character of `word` and lower-case all the others.
fn capitalize_word(word: &str) -> String
{
    let mut chars = word.chars();
    match chars.next() {
        Some(first) => first.to_uppercase()
            .chain(chars.flat_map(|c| c.to_lowercase()))
            .collect(),
        None => String::new(),
    }
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct CapitalizeTransformer {}
|
||||
|
||||
impl CapitalizeTransformer
|
||||
{
|
||||
pub fn new() -> CapitalizeTransformer
|
||||
{
|
||||
CapitalizeTransformer {}
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for CapitalizeTransformer
|
||||
{
|
||||
fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult
|
||||
{
|
||||
let result = capitalize(field_value);
|
||||
TransformResult::present(&result)
|
||||
}
|
||||
}
|
||||
37
src/transformers/choice.rs
Normal file
37
src/transformers/choice.rs
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct ChoiceTransformer {
|
||||
choices: Vec<String>,
|
||||
}
|
||||
|
||||
impl ChoiceTransformer
|
||||
{
|
||||
pub fn new(choices: Vec<String>) -> ChoiceTransformer
|
||||
{
|
||||
ChoiceTransformer {
|
||||
choices: choices,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for ChoiceTransformer
|
||||
{
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult
|
||||
{
|
||||
if self.choices.contains(&field_value.to_string()) {
|
||||
TransformResult::present(&field_value)
|
||||
} else {
|
||||
TransformResult::error(
|
||||
field_value,
|
||||
field_name,
|
||||
record_n,
|
||||
&format!("not in valid choices {:?}", self.choices)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
41
src/transformers/date.rs
Normal file
41
src/transformers/date.rs
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
|
||||
use time::{
|
||||
strptime
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct DateTransformer {
|
||||
input_formats: Vec<String>,
|
||||
output_format: String
|
||||
}
|
||||
|
||||
impl DateTransformer {
|
||||
pub fn new(input_formats: Vec<String>, output_format: &str) -> DateTransformer {
|
||||
DateTransformer {
|
||||
input_formats: input_formats,
|
||||
output_format: output_format.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_iso8601_output(input_formats: Vec<String>) -> DateTransformer {
|
||||
Self::new(input_formats, "%F")
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for DateTransformer {
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
for format in self.input_formats.iter() {
|
||||
if let Ok(time) = strptime(field_value, &format) {
|
||||
return TransformResult::present(
|
||||
&format!("{}", time.strftime(&self.output_format).unwrap())
|
||||
);
|
||||
}
|
||||
}
|
||||
TransformResult::error(field_value, field_name, record_n, "unable to parse as date")
|
||||
}
|
||||
}
|
||||
30
src/transformers/email.rs
Normal file
30
src/transformers/email.rs
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
lazy_static! {
|
||||
static ref EMAIL_REGEX: Regex = Regex::new(r"(?i)\A[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\z").unwrap();
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct EmailTransformer {}
|
||||
|
||||
impl EmailTransformer {
|
||||
pub fn new() -> EmailTransformer {
|
||||
EmailTransformer {}
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for EmailTransformer {
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
if EMAIL_REGEX.is_match(field_value) {
|
||||
TransformResult::present(&field_value.to_lowercase())
|
||||
} else {
|
||||
TransformResult::error(field_value, field_name, record_n, "invalid email address")
|
||||
}
|
||||
}
|
||||
}
|
||||
76
src/transformers/mod.rs
Normal file
76
src/transformers/mod.rs
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
use transformer::{
|
||||
Transformer,
|
||||
TransformResult,
|
||||
};
|
||||
|
||||
mod trim;
|
||||
pub use self::trim::TrimTransformer;
|
||||
|
||||
mod none;
|
||||
pub use self::none::NoneTransformer;
|
||||
|
||||
mod regex;
|
||||
pub use self::regex::{
|
||||
RegexTransformer,
|
||||
RegexMatchTransformer
|
||||
};
|
||||
|
||||
mod capitalize;
|
||||
pub use self::capitalize::{
|
||||
CapitalizeTransformer,
|
||||
capitalize
|
||||
};
|
||||
|
||||
mod email;
|
||||
pub use self::email::EmailTransformer;
|
||||
|
||||
mod number;
|
||||
pub use self::number::NumberTransformer;
|
||||
|
||||
mod date;
|
||||
pub use self::date::DateTransformer;
|
||||
|
||||
mod choice;
|
||||
pub use self::choice::ChoiceTransformer;
|
||||
|
||||
mod zipcode;
|
||||
pub use self::zipcode::ZipcodeTransformer;
|
||||
|
||||
mod phone_number;
|
||||
pub use self::phone_number::PhoneNumberTransformer;
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub enum Transformers
|
||||
{
|
||||
Trim(TrimTransformer),
|
||||
None(NoneTransformer),
|
||||
Regex(RegexTransformer),
|
||||
RegexMatch(RegexMatchTransformer),
|
||||
Capitalize(CapitalizeTransformer),
|
||||
Email(EmailTransformer),
|
||||
Number(NumberTransformer),
|
||||
Date(DateTransformer),
|
||||
Choice(ChoiceTransformer),
|
||||
Zipcode(ZipcodeTransformer),
|
||||
PhoneNumber(PhoneNumberTransformer),
|
||||
}
|
||||
|
||||
impl Transformer for Transformers {
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
use self::Transformers::*;
|
||||
|
||||
match *self {
|
||||
Trim(ref t) => t.transform(field_value, field_name, record_n),
|
||||
None(ref t) => t.transform(field_value, field_name, record_n),
|
||||
Regex(ref t) => t.transform(field_value, field_name, record_n),
|
||||
RegexMatch(ref t) => t.transform(field_value, field_name, record_n),
|
||||
Capitalize(ref t) => t.transform(field_value, field_name, record_n),
|
||||
Email(ref t) => t.transform(field_value, field_name, record_n),
|
||||
Number(ref t) => t.transform(field_value, field_name, record_n),
|
||||
Date(ref t) => t.transform(field_value, field_name, record_n),
|
||||
Choice(ref t) => t.transform(field_value, field_name, record_n),
|
||||
Zipcode(ref t) => t.transform(field_value, field_name, record_n),
|
||||
PhoneNumber(ref t) => t.transform(field_value, field_name, record_n)
|
||||
}
|
||||
}
|
||||
}
|
||||
34
src/transformers/none.rs
Normal file
34
src/transformers/none.rs
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
use newtypes::Regex;
|
||||
|
||||
use regex;
|
||||
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct NoneTransformer {
|
||||
regex: Regex
|
||||
}
|
||||
|
||||
impl NoneTransformer {
|
||||
pub fn new(regex: regex::Regex) -> NoneTransformer {
|
||||
NoneTransformer { regex: Regex::from(regex) }
|
||||
}
|
||||
|
||||
pub fn with_blank_matcher() -> NoneTransformer {
|
||||
Self::new(regex::Regex::new(r"\A(?:[:cntrl:]|\s)*\z").unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for NoneTransformer {
|
||||
fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult {
|
||||
if self.regex.is_match(field_value) {
|
||||
TransformResult::excluded()
|
||||
} else {
|
||||
TransformResult::present(field_value)
|
||||
}
|
||||
}
|
||||
}
|
||||
30
src/transformers/number.rs
Normal file
30
src/transformers/number.rs
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
lazy_static! {
    // Matches a whole value that is either a lone `0` or a non-zero digit
    // followed by any number of digits (no sign, no leading zeros).
    //
    // The group is non-capturing: `(?:...)`. The previous spelling `(:?...)`
    // was a capturing group starting with an *optional literal colon*, so a
    // value such as ":0" was accepted as a number.
    static ref INTEGER_REGEX: Regex = Regex::new(r"\A(?:0|[1-9]\d*)\z").unwrap();
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct NumberTransformer { }
|
||||
|
||||
impl NumberTransformer {
|
||||
pub fn match_integer() -> NumberTransformer {
|
||||
NumberTransformer { }
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for NumberTransformer {
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
if INTEGER_REGEX.is_match(field_value) {
|
||||
TransformResult::present(field_value)
|
||||
} else {
|
||||
TransformResult::error(field_value, field_name, record_n, "not a valid number")
|
||||
}
|
||||
}
|
||||
}
|
||||
34
src/transformers/phone_number.rs
Normal file
34
src/transformers/phone_number.rs
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
lazy_static! {
    // Pattern pieces, in order:
    //   \A(?:\+?1)?                 optional leading "1" or "+1"
    //   \D*\(?(?P<area>\d{3})\)?    three-digit `area` group, optionally parenthesised
    //   \D*(?P<exchange>\d{3})      three-digit `exchange` group
    //   \D*(?P<subscriber>\d{4})\z  four-digit `subscriber` group, then end of input
    // Any run of non-digits may separate the groups.
    static ref NANP_REGEX: Regex = Regex::new(
        r"\A(?:\+?1)?\D*\(?(?P<area>\d{3})\)?\D*(?P<exchange>\d{3})\D*(?P<subscriber>\d{4})\z"
    ).unwrap();
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct PhoneNumberTransformer { }
|
||||
|
||||
impl PhoneNumberTransformer {
|
||||
pub fn expect_nanp_format() -> PhoneNumberTransformer {
|
||||
PhoneNumberTransformer { }
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for PhoneNumberTransformer {
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
if let Some(captures) = NANP_REGEX.captures(field_value) {
|
||||
let area_code = captures.name("area").unwrap().as_str();
|
||||
let exchange_code = captures.name("exchange").unwrap().as_str();
|
||||
let subscriber_number = captures.name("subscriber").unwrap().as_str();
|
||||
let phone_number = format!("+1 {} {} {}", area_code, exchange_code, subscriber_number);
|
||||
TransformResult::present(&phone_number)
|
||||
} else {
|
||||
TransformResult::error(field_value, field_name, record_n, "not a valid NANP format phone number")
|
||||
}
|
||||
}
|
||||
}
|
||||
88
src/transformers/regex.rs
Normal file
88
src/transformers/regex.rs
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
use newtypes::Regex;
|
||||
|
||||
use regex;
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct RegexTransformer
|
||||
{
|
||||
regex: Regex,
|
||||
template: String
|
||||
}
|
||||
|
||||
impl RegexTransformer
|
||||
{
|
||||
pub fn new(regex: regex::Regex, template: &str) -> RegexTransformer {
|
||||
RegexTransformer {
|
||||
regex: Regex::from(regex),
|
||||
template: template.to_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for RegexTransformer
|
||||
{
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
if let Some(captures) = self.regex.captures(field_value) {
|
||||
let mut expansion = String::new();
|
||||
captures.expand(&self.template, &mut expansion);
|
||||
TransformResult::present(&expansion)
|
||||
} else {
|
||||
TransformResult::error(
|
||||
field_value,
|
||||
field_name,
|
||||
record_n,
|
||||
&format!("did not match pattern {}", self.regex)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct RegexMatchTransformer
|
||||
{
|
||||
regex: Regex,
|
||||
negate: bool
|
||||
}
|
||||
|
||||
impl RegexMatchTransformer
|
||||
{
|
||||
pub fn matching(regex: regex::Regex) -> RegexMatchTransformer {
|
||||
RegexMatchTransformer {
|
||||
regex: Regex::from(regex),
|
||||
negate: false
|
||||
}
|
||||
}
|
||||
|
||||
pub fn not_matching(regex: regex::Regex) -> RegexMatchTransformer {
|
||||
RegexMatchTransformer {
|
||||
regex: Regex::from(regex),
|
||||
negate: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for RegexMatchTransformer
|
||||
{
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
let mut is_match = self.regex.is_match(field_value);
|
||||
if self.negate {
|
||||
is_match = !is_match;
|
||||
}
|
||||
|
||||
if is_match {
|
||||
TransformResult::present(field_value)
|
||||
} else {
|
||||
let reason = if self.negate {
|
||||
format!("matched exclusionary pattern {}", self.regex)
|
||||
} else {
|
||||
format!("did not match pattern {}", self.regex)
|
||||
};
|
||||
TransformResult::error(field_value, field_name, record_n, &reason)
|
||||
}
|
||||
}
|
||||
}
|
||||
20
src/transformers/trim.rs
Normal file
20
src/transformers/trim.rs
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct TrimTransformer {}
|
||||
|
||||
impl TrimTransformer {
|
||||
pub fn new() -> TrimTransformer {
|
||||
TrimTransformer {}
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for TrimTransformer {
|
||||
fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult {
|
||||
TransformResult::present(field_value.trim())
|
||||
}
|
||||
}
|
||||
37
src/transformers/zipcode.rs
Normal file
37
src/transformers/zipcode.rs
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
use Transformer;
|
||||
use transformer::{
|
||||
TransformResultHelper,
|
||||
TransformResult
|
||||
};
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
lazy_static! {
    // Group 1: the five-digit base code at the start of the value.
    // Then any run of non-digits (e.g. a dash or space).
    // Group 2 (optional): a four-digit extension that must end the value.
    static ref ZIP_REGEX: Regex = Regex::new(
        r"\A(\d{5})\D*(?:(\d{4}))?\z"
    ).unwrap();
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||
pub struct ZipcodeTransformer { }
|
||||
|
||||
impl ZipcodeTransformer {
|
||||
pub fn new() -> ZipcodeTransformer {
|
||||
ZipcodeTransformer { }
|
||||
}
|
||||
}
|
||||
|
||||
impl Transformer for ZipcodeTransformer {
|
||||
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||
if let Some(captures) = ZIP_REGEX.captures(field_value) {
|
||||
let base_code = captures.get(1).unwrap();
|
||||
let plus_four_code = captures.get(2);
|
||||
let zipcode = if let Some(pfc) = plus_four_code {
|
||||
format!("{}-{}", base_code.as_str(), pfc.as_str())
|
||||
} else {
|
||||
base_code.as_str().to_string()
|
||||
};
|
||||
TransformResult::present(&zipcode)
|
||||
} else {
|
||||
TransformResult::error(field_value, field_name, record_n, "not a valid zipcode")
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue