Initial commit
This commit is contained in:
commit
26c5433d16
21 changed files with 1968 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
target
|
||||||
|
Cargo.lock
|
||||||
22
Cargo.toml
Normal file
22
Cargo.toml
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
[package]
|
||||||
|
name = "csv-sanity"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["M. George Hansen <technopolitica@gmail.com>"]
|
||||||
|
license = "MPL-2.0"
|
||||||
|
maintenance = { status = "passively-maintained" }
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
csv = "0.15.0"
|
||||||
|
clap = "2.23.3"
|
||||||
|
log = "0.3.7"
|
||||||
|
regex = "0.2.1"
|
||||||
|
lazy_static = "0.2.8"
|
||||||
|
unicode-segmentation = "1.1.0"
|
||||||
|
time = "0.1.37"
|
||||||
|
maplit = "0.1.4"
|
||||||
|
serde = "1.0"
|
||||||
|
serde_derive = "1.0"
|
||||||
|
serde_json = "1.0"
|
||||||
|
custom_derive = "0.1.7"
|
||||||
|
newtype_derive = "0.1.6"
|
||||||
|
rustc-serialize = "0.3"
|
||||||
373
LICENSE
Normal file
373
LICENSE
Normal file
|
|
@ -0,0 +1,373 @@
|
||||||
|
Mozilla Public License Version 2.0
|
||||||
|
==================================
|
||||||
|
|
||||||
|
1. Definitions
|
||||||
|
--------------
|
||||||
|
|
||||||
|
1.1. "Contributor"
|
||||||
|
means each individual or legal entity that creates, contributes to
|
||||||
|
the creation of, or owns Covered Software.
|
||||||
|
|
||||||
|
1.2. "Contributor Version"
|
||||||
|
means the combination of the Contributions of others (if any) used
|
||||||
|
by a Contributor and that particular Contributor's Contribution.
|
||||||
|
|
||||||
|
1.3. "Contribution"
|
||||||
|
means Covered Software of a particular Contributor.
|
||||||
|
|
||||||
|
1.4. "Covered Software"
|
||||||
|
means Source Code Form to which the initial Contributor has attached
|
||||||
|
the notice in Exhibit A, the Executable Form of such Source Code
|
||||||
|
Form, and Modifications of such Source Code Form, in each case
|
||||||
|
including portions thereof.
|
||||||
|
|
||||||
|
1.5. "Incompatible With Secondary Licenses"
|
||||||
|
means
|
||||||
|
|
||||||
|
(a) that the initial Contributor has attached the notice described
|
||||||
|
in Exhibit B to the Covered Software; or
|
||||||
|
|
||||||
|
(b) that the Covered Software was made available under the terms of
|
||||||
|
version 1.1 or earlier of the License, but not also under the
|
||||||
|
terms of a Secondary License.
|
||||||
|
|
||||||
|
1.6. "Executable Form"
|
||||||
|
means any form of the work other than Source Code Form.
|
||||||
|
|
||||||
|
1.7. "Larger Work"
|
||||||
|
means a work that combines Covered Software with other material, in
|
||||||
|
a separate file or files, that is not Covered Software.
|
||||||
|
|
||||||
|
1.8. "License"
|
||||||
|
means this document.
|
||||||
|
|
||||||
|
1.9. "Licensable"
|
||||||
|
means having the right to grant, to the maximum extent possible,
|
||||||
|
whether at the time of the initial grant or subsequently, any and
|
||||||
|
all of the rights conveyed by this License.
|
||||||
|
|
||||||
|
1.10. "Modifications"
|
||||||
|
means any of the following:
|
||||||
|
|
||||||
|
(a) any file in Source Code Form that results from an addition to,
|
||||||
|
deletion from, or modification of the contents of Covered
|
||||||
|
Software; or
|
||||||
|
|
||||||
|
(b) any new file in Source Code Form that contains any Covered
|
||||||
|
Software.
|
||||||
|
|
||||||
|
1.11. "Patent Claims" of a Contributor
|
||||||
|
means any patent claim(s), including without limitation, method,
|
||||||
|
process, and apparatus claims, in any patent Licensable by such
|
||||||
|
Contributor that would be infringed, but for the grant of the
|
||||||
|
License, by the making, using, selling, offering for sale, having
|
||||||
|
made, import, or transfer of either its Contributions or its
|
||||||
|
Contributor Version.
|
||||||
|
|
||||||
|
1.12. "Secondary License"
|
||||||
|
means either the GNU General Public License, Version 2.0, the GNU
|
||||||
|
Lesser General Public License, Version 2.1, the GNU Affero General
|
||||||
|
Public License, Version 3.0, or any later versions of those
|
||||||
|
licenses.
|
||||||
|
|
||||||
|
1.13. "Source Code Form"
|
||||||
|
means the form of the work preferred for making modifications.
|
||||||
|
|
||||||
|
1.14. "You" (or "Your")
|
||||||
|
means an individual or a legal entity exercising rights under this
|
||||||
|
License. For legal entities, "You" includes any entity that
|
||||||
|
controls, is controlled by, or is under common control with You. For
|
||||||
|
purposes of this definition, "control" means (a) the power, direct
|
||||||
|
or indirect, to cause the direction or management of such entity,
|
||||||
|
whether by contract or otherwise, or (b) ownership of more than
|
||||||
|
fifty percent (50%) of the outstanding shares or beneficial
|
||||||
|
ownership of such entity.
|
||||||
|
|
||||||
|
2. License Grants and Conditions
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
2.1. Grants
|
||||||
|
|
||||||
|
Each Contributor hereby grants You a world-wide, royalty-free,
|
||||||
|
non-exclusive license:
|
||||||
|
|
||||||
|
(a) under intellectual property rights (other than patent or trademark)
|
||||||
|
Licensable by such Contributor to use, reproduce, make available,
|
||||||
|
modify, display, perform, distribute, and otherwise exploit its
|
||||||
|
Contributions, either on an unmodified basis, with Modifications, or
|
||||||
|
as part of a Larger Work; and
|
||||||
|
|
||||||
|
(b) under Patent Claims of such Contributor to make, use, sell, offer
|
||||||
|
for sale, have made, import, and otherwise transfer either its
|
||||||
|
Contributions or its Contributor Version.
|
||||||
|
|
||||||
|
2.2. Effective Date
|
||||||
|
|
||||||
|
The licenses granted in Section 2.1 with respect to any Contribution
|
||||||
|
become effective for each Contribution on the date the Contributor first
|
||||||
|
distributes such Contribution.
|
||||||
|
|
||||||
|
2.3. Limitations on Grant Scope
|
||||||
|
|
||||||
|
The licenses granted in this Section 2 are the only rights granted under
|
||||||
|
this License. No additional rights or licenses will be implied from the
|
||||||
|
distribution or licensing of Covered Software under this License.
|
||||||
|
Notwithstanding Section 2.1(b) above, no patent license is granted by a
|
||||||
|
Contributor:
|
||||||
|
|
||||||
|
(a) for any code that a Contributor has removed from Covered Software;
|
||||||
|
or
|
||||||
|
|
||||||
|
(b) for infringements caused by: (i) Your and any other third party's
|
||||||
|
modifications of Covered Software, or (ii) the combination of its
|
||||||
|
Contributions with other software (except as part of its Contributor
|
||||||
|
Version); or
|
||||||
|
|
||||||
|
(c) under Patent Claims infringed by Covered Software in the absence of
|
||||||
|
its Contributions.
|
||||||
|
|
||||||
|
This License does not grant any rights in the trademarks, service marks,
|
||||||
|
or logos of any Contributor (except as may be necessary to comply with
|
||||||
|
the notice requirements in Section 3.4).
|
||||||
|
|
||||||
|
2.4. Subsequent Licenses
|
||||||
|
|
||||||
|
No Contributor makes additional grants as a result of Your choice to
|
||||||
|
distribute the Covered Software under a subsequent version of this
|
||||||
|
License (see Section 10.2) or under the terms of a Secondary License (if
|
||||||
|
permitted under the terms of Section 3.3).
|
||||||
|
|
||||||
|
2.5. Representation
|
||||||
|
|
||||||
|
Each Contributor represents that the Contributor believes its
|
||||||
|
Contributions are its original creation(s) or it has sufficient rights
|
||||||
|
to grant the rights to its Contributions conveyed by this License.
|
||||||
|
|
||||||
|
2.6. Fair Use
|
||||||
|
|
||||||
|
This License is not intended to limit any rights You have under
|
||||||
|
applicable copyright doctrines of fair use, fair dealing, or other
|
||||||
|
equivalents.
|
||||||
|
|
||||||
|
2.7. Conditions
|
||||||
|
|
||||||
|
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
|
||||||
|
in Section 2.1.
|
||||||
|
|
||||||
|
3. Responsibilities
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
3.1. Distribution of Source Form
|
||||||
|
|
||||||
|
All distribution of Covered Software in Source Code Form, including any
|
||||||
|
Modifications that You create or to which You contribute, must be under
|
||||||
|
the terms of this License. You must inform recipients that the Source
|
||||||
|
Code Form of the Covered Software is governed by the terms of this
|
||||||
|
License, and how they can obtain a copy of this License. You may not
|
||||||
|
attempt to alter or restrict the recipients' rights in the Source Code
|
||||||
|
Form.
|
||||||
|
|
||||||
|
3.2. Distribution of Executable Form
|
||||||
|
|
||||||
|
If You distribute Covered Software in Executable Form then:
|
||||||
|
|
||||||
|
(a) such Covered Software must also be made available in Source Code
|
||||||
|
Form, as described in Section 3.1, and You must inform recipients of
|
||||||
|
the Executable Form how they can obtain a copy of such Source Code
|
||||||
|
Form by reasonable means in a timely manner, at a charge no more
|
||||||
|
than the cost of distribution to the recipient; and
|
||||||
|
|
||||||
|
(b) You may distribute such Executable Form under the terms of this
|
||||||
|
License, or sublicense it under different terms, provided that the
|
||||||
|
license for the Executable Form does not attempt to limit or alter
|
||||||
|
the recipients' rights in the Source Code Form under this License.
|
||||||
|
|
||||||
|
3.3. Distribution of a Larger Work
|
||||||
|
|
||||||
|
You may create and distribute a Larger Work under terms of Your choice,
|
||||||
|
provided that You also comply with the requirements of this License for
|
||||||
|
the Covered Software. If the Larger Work is a combination of Covered
|
||||||
|
Software with a work governed by one or more Secondary Licenses, and the
|
||||||
|
Covered Software is not Incompatible With Secondary Licenses, this
|
||||||
|
License permits You to additionally distribute such Covered Software
|
||||||
|
under the terms of such Secondary License(s), so that the recipient of
|
||||||
|
the Larger Work may, at their option, further distribute the Covered
|
||||||
|
Software under the terms of either this License or such Secondary
|
||||||
|
License(s).
|
||||||
|
|
||||||
|
3.4. Notices
|
||||||
|
|
||||||
|
You may not remove or alter the substance of any license notices
|
||||||
|
(including copyright notices, patent notices, disclaimers of warranty,
|
||||||
|
or limitations of liability) contained within the Source Code Form of
|
||||||
|
the Covered Software, except that You may alter any license notices to
|
||||||
|
the extent required to remedy known factual inaccuracies.
|
||||||
|
|
||||||
|
3.5. Application of Additional Terms
|
||||||
|
|
||||||
|
You may choose to offer, and to charge a fee for, warranty, support,
|
||||||
|
indemnity or liability obligations to one or more recipients of Covered
|
||||||
|
Software. However, You may do so only on Your own behalf, and not on
|
||||||
|
behalf of any Contributor. You must make it absolutely clear that any
|
||||||
|
such warranty, support, indemnity, or liability obligation is offered by
|
||||||
|
You alone, and You hereby agree to indemnify every Contributor for any
|
||||||
|
liability incurred by such Contributor as a result of warranty, support,
|
||||||
|
indemnity or liability terms You offer. You may include additional
|
||||||
|
disclaimers of warranty and limitations of liability specific to any
|
||||||
|
jurisdiction.
|
||||||
|
|
||||||
|
4. Inability to Comply Due to Statute or Regulation
|
||||||
|
---------------------------------------------------
|
||||||
|
|
||||||
|
If it is impossible for You to comply with any of the terms of this
|
||||||
|
License with respect to some or all of the Covered Software due to
|
||||||
|
statute, judicial order, or regulation then You must: (a) comply with
|
||||||
|
the terms of this License to the maximum extent possible; and (b)
|
||||||
|
describe the limitations and the code they affect. Such description must
|
||||||
|
be placed in a text file included with all distributions of the Covered
|
||||||
|
Software under this License. Except to the extent prohibited by statute
|
||||||
|
or regulation, such description must be sufficiently detailed for a
|
||||||
|
recipient of ordinary skill to be able to understand it.
|
||||||
|
|
||||||
|
5. Termination
|
||||||
|
--------------
|
||||||
|
|
||||||
|
5.1. The rights granted under this License will terminate automatically
|
||||||
|
if You fail to comply with any of its terms. However, if You become
|
||||||
|
compliant, then the rights granted under this License from a particular
|
||||||
|
Contributor are reinstated (a) provisionally, unless and until such
|
||||||
|
Contributor explicitly and finally terminates Your grants, and (b) on an
|
||||||
|
ongoing basis, if such Contributor fails to notify You of the
|
||||||
|
non-compliance by some reasonable means prior to 60 days after You have
|
||||||
|
come back into compliance. Moreover, Your grants from a particular
|
||||||
|
Contributor are reinstated on an ongoing basis if such Contributor
|
||||||
|
notifies You of the non-compliance by some reasonable means, this is the
|
||||||
|
first time You have received notice of non-compliance with this License
|
||||||
|
from such Contributor, and You become compliant prior to 30 days after
|
||||||
|
Your receipt of the notice.
|
||||||
|
|
||||||
|
5.2. If You initiate litigation against any entity by asserting a patent
|
||||||
|
infringement claim (excluding declaratory judgment actions,
|
||||||
|
counter-claims, and cross-claims) alleging that a Contributor Version
|
||||||
|
directly or indirectly infringes any patent, then the rights granted to
|
||||||
|
You by any and all Contributors for the Covered Software under Section
|
||||||
|
2.1 of this License shall terminate.
|
||||||
|
|
||||||
|
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
|
||||||
|
end user license agreements (excluding distributors and resellers) which
|
||||||
|
have been validly granted by You or Your distributors under this License
|
||||||
|
prior to termination shall survive termination.
|
||||||
|
|
||||||
|
************************************************************************
|
||||||
|
* *
|
||||||
|
* 6. Disclaimer of Warranty *
|
||||||
|
* ------------------------- *
|
||||||
|
* *
|
||||||
|
* Covered Software is provided under this License on an "as is" *
|
||||||
|
* basis, without warranty of any kind, either expressed, implied, or *
|
||||||
|
* statutory, including, without limitation, warranties that the *
|
||||||
|
* Covered Software is free of defects, merchantable, fit for a *
|
||||||
|
* particular purpose or non-infringing. The entire risk as to the *
|
||||||
|
* quality and performance of the Covered Software is with You. *
|
||||||
|
* Should any Covered Software prove defective in any respect, You *
|
||||||
|
* (not any Contributor) assume the cost of any necessary servicing, *
|
||||||
|
* repair, or correction. This disclaimer of warranty constitutes an *
|
||||||
|
* essential part of this License. No use of any Covered Software is *
|
||||||
|
* authorized under this License except under this disclaimer. *
|
||||||
|
* *
|
||||||
|
************************************************************************
|
||||||
|
|
||||||
|
************************************************************************
|
||||||
|
* *
|
||||||
|
* 7. Limitation of Liability *
|
||||||
|
* -------------------------- *
|
||||||
|
* *
|
||||||
|
* Under no circumstances and under no legal theory, whether tort *
|
||||||
|
* (including negligence), contract, or otherwise, shall any *
|
||||||
|
* Contributor, or anyone who distributes Covered Software as *
|
||||||
|
* permitted above, be liable to You for any direct, indirect, *
|
||||||
|
* special, incidental, or consequential damages of any character *
|
||||||
|
* including, without limitation, damages for lost profits, loss of *
|
||||||
|
* goodwill, work stoppage, computer failure or malfunction, or any *
|
||||||
|
* and all other commercial damages or losses, even if such party *
|
||||||
|
* shall have been informed of the possibility of such damages. This *
|
||||||
|
* limitation of liability shall not apply to liability for death or *
|
||||||
|
* personal injury resulting from such party's negligence to the *
|
||||||
|
* extent applicable law prohibits such limitation. Some *
|
||||||
|
* jurisdictions do not allow the exclusion or limitation of *
|
||||||
|
* incidental or consequential damages, so this exclusion and *
|
||||||
|
* limitation may not apply to You. *
|
||||||
|
* *
|
||||||
|
************************************************************************
|
||||||
|
|
||||||
|
8. Litigation
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Any litigation relating to this License may be brought only in the
|
||||||
|
courts of a jurisdiction where the defendant maintains its principal
|
||||||
|
place of business and such litigation shall be governed by laws of that
|
||||||
|
jurisdiction, without reference to its conflict-of-law provisions.
|
||||||
|
Nothing in this Section shall prevent a party's ability to bring
|
||||||
|
cross-claims or counter-claims.
|
||||||
|
|
||||||
|
9. Miscellaneous
|
||||||
|
----------------
|
||||||
|
|
||||||
|
This License represents the complete agreement concerning the subject
|
||||||
|
matter hereof. If any provision of this License is held to be
|
||||||
|
unenforceable, such provision shall be reformed only to the extent
|
||||||
|
necessary to make it enforceable. Any law or regulation which provides
|
||||||
|
that the language of a contract shall be construed against the drafter
|
||||||
|
shall not be used to construe this License against a Contributor.
|
||||||
|
|
||||||
|
10. Versions of the License
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
10.1. New Versions
|
||||||
|
|
||||||
|
Mozilla Foundation is the license steward. Except as provided in Section
|
||||||
|
10.3, no one other than the license steward has the right to modify or
|
||||||
|
publish new versions of this License. Each version will be given a
|
||||||
|
distinguishing version number.
|
||||||
|
|
||||||
|
10.2. Effect of New Versions
|
||||||
|
|
||||||
|
You may distribute the Covered Software under the terms of the version
|
||||||
|
of the License under which You originally received the Covered Software,
|
||||||
|
or under the terms of any subsequent version published by the license
|
||||||
|
steward.
|
||||||
|
|
||||||
|
10.3. Modified Versions
|
||||||
|
|
||||||
|
If you create software not governed by this License, and you want to
|
||||||
|
create a new license for such software, you may create and use a
|
||||||
|
modified version of this License if you rename the license and remove
|
||||||
|
any references to the name of the license steward (except to note that
|
||||||
|
such modified license differs from this License).
|
||||||
|
|
||||||
|
10.4. Distributing Source Code Form that is Incompatible With Secondary
|
||||||
|
Licenses
|
||||||
|
|
||||||
|
If You choose to distribute Source Code Form that is Incompatible With
|
||||||
|
Secondary Licenses under the terms of this version of the License, the
|
||||||
|
notice described in Exhibit B of this License must be attached.
|
||||||
|
|
||||||
|
Exhibit A - Source Code Form License Notice
|
||||||
|
-------------------------------------------
|
||||||
|
|
||||||
|
This Source Code Form is subject to the terms of the Mozilla Public
|
||||||
|
License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||||
|
file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
If it is not possible or desirable to put the notice in a particular
|
||||||
|
file, then You may include the notice in a location (such as a LICENSE
|
||||||
|
file in a relevant directory) where a recipient would be likely to look
|
||||||
|
for such a notice.
|
||||||
|
|
||||||
|
You may add additional accurate notices of copyright ownership.
|
||||||
|
|
||||||
|
Exhibit B - "Incompatible With Secondary Licenses" Notice
|
||||||
|
---------------------------------------------------------
|
||||||
|
|
||||||
|
This Source Code Form is "Incompatible With Secondary Licenses", as
|
||||||
|
defined by the Mozilla Public License, v. 2.0.
|
||||||
268
README.md
Normal file
268
README.md
Normal file
|
|
@ -0,0 +1,268 @@
|
||||||
|
# csv-sanity
|
||||||
|
|
||||||
|
Preserve your sanity in a world full of malformed, poorly validated CSV files.
|
||||||
|
Sanitize and transform large CSVs with millions of records quickly and
|
||||||
|
efficiently.
|
||||||
|
|
||||||
|
**NOTE:** csv-sanity is in an alpha state and is subject to breaking changes.
|
||||||
|
The ruleset file syntax in particular is likely to change in the near future.
|
||||||
|
I've personally used csv-sanity on a number of projects and it has been
|
||||||
|
incredibly helpful, but as with most alpha software csv-sanity is provided
|
||||||
|
as-is and provides no warranty or guarantee. Use at your own risk and double
|
||||||
|
check your transformed files!
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
The CSV format is not well-standardized and has many shortfalls when it comes to
|
||||||
|
storing large numbers of records with complex data formats, but CSVs are
|
||||||
|
ubiquitous in many realms as a neutral interchange format that most CRMs and
|
||||||
|
database software can parse and understand.
|
||||||
|
|
||||||
|
But what happens when your CRM can only parse ISO 8601 formatted dates and the
|
||||||
|
CSV you inherited has dates in another format such as the following:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
id,name,signup_date
|
||||||
|
2,John Doe,11/22/2017
|
||||||
|
3,Jane Doe,11/28/2017
|
||||||
|
```
|
||||||
|
|
||||||
|
Or you received a CSV of people who you need to contact via a personalized
|
||||||
|
email, but your contacts' names in the CSV are in ALL CAPS:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
id,first_name,last_name
|
||||||
|
2,JOHN,DOE
|
||||||
|
3,JANE,DOE
|
||||||
|
```
|
||||||
|
|
||||||
|
Or you have a CSV that has valid values for the vast majority of records, but 1
|
||||||
|
out of every 20k records has nonsense values that cause your entire import to
|
||||||
|
abort:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
id,first_name,last_name,party_registration
|
||||||
|
2,Jane,Doe,REP
|
||||||
|
3,John,Doe,DEM
|
||||||
|
345,Josh,Smith,HAHAHAHA
|
||||||
|
```
|
||||||
|
|
||||||
|
Or even a CSV that has a few malformed records due to unescaped commas:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
id,first_name,last_name,email
|
||||||
|
2,Jane,Doe,jane@example.com
|
||||||
|
3,John,Doe,"i,don't,follow,the,rules"@example.com
|
||||||
|
```
|
||||||
|
|
||||||
|
These are all real problems I've encountered with CSVs over the years. If the
|
||||||
|
CSV is small enough they can be corrected by hand, but for CSVs with 10k, 100k
|
||||||
|
or even millions of records correcting by hand simply isn't a viable option.
|
||||||
|
|
||||||
|
`csv-sanity` aims to solve the issue of sanitizing large, poorly-validated CSVs.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
`csv-sanity` is an executable that takes an input CSV to process and a JSON
|
||||||
|
ruleset file defining the transformation rules to apply:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
csv-sanity [-r RULESET_FILE] <INPUT_FILE>
|
||||||
|
```
|
||||||
|
|
||||||
|
If a path to a ruleset file is not provided via the `-r` option, `csv-sanity`
|
||||||
|
will look for a file named "ruleset.json" in the current directory.
|
||||||
|
|
||||||
|
By default, `csv-sanity` outputs two files to the current directory:
|
||||||
|
output.csv, which contains the processed CSV with validated and transformed
|
||||||
|
records, and errors.csv, which contains a list of records and fields that
|
||||||
|
couldn't be processed and reasons they were rejected. The paths where the output
|
||||||
|
and error files are output can be overridden via the `-o FILE_PATH` and
|
||||||
|
`-e FILE_PATH` options, respectively.
|
||||||
|
|
||||||
|
## ruleset.json Syntax
|
||||||
|
|
||||||
|
Ruleset files are JSON files that define a collection of transformation rules
|
||||||
|
and the fields to which they should be applied.
|
||||||
|
|
||||||
|
The following is an example ruleset JSON file:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"applicability": {
|
||||||
|
"Global": []
|
||||||
|
},
|
||||||
|
"transformer": {
|
||||||
|
"None": {
|
||||||
|
"regex": "\\A(?:[:cntrl:]|\\s)*\\z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"priority": -10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"applicability": {
|
||||||
|
"Global": []
|
||||||
|
},
|
||||||
|
"transformer": {
|
||||||
|
"Trim": {}
|
||||||
|
},
|
||||||
|
"priority": -10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"applicability": {
|
||||||
|
"Fields": {
|
||||||
|
"field_names": [
|
||||||
|
"first_name",
|
||||||
|
"last_name"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"transformer": {
|
||||||
|
"Capitalize": {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Every ruleset.json file is a JSON object with a single "rules" field with an
|
||||||
|
array of rule objects.
|
||||||
|
|
||||||
|
Rules are objects with two fields:
|
||||||
|
|
||||||
|
- **"applicability"**: specifies whether a rule applies globally or only to a
|
||||||
|
predefined set of fields (specified as the column headers in the CSV being
|
||||||
|
processed)
|
||||||
|
- **"transformer"**: a transformer object, which specifies how the applicable
|
||||||
|
fields should be transformed.
|
||||||
|
|
||||||
|
### Transformers
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#### Capitalize
|
||||||
|
|
||||||
|
Transforms string fields into Capital Case.
|
||||||
|
|
||||||
|
#### Choice
|
||||||
|
|
||||||
|
Only accepts a pre-defined list of acceptable values and rejects the rest.
|
||||||
|
|
||||||
|
#### Date
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"Date": {
|
||||||
|
"input_formats": [
|
||||||
|
"%m/%d/%Y"
|
||||||
|
],
|
||||||
|
"output_formats": "%F"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Attempt to parse fields with a list of datetime formats via
|
||||||
|
[time::strptime](https://docs.rs/time/0.1.37/time/fn.strptime.html). See the
|
||||||
|
docs for the [time](https://docs.rs/time/0.1.37/time/index.html) crate for
|
||||||
|
details on datetime formatting syntax.
|
||||||
|
|
||||||
|
#### Email
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"Email": {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Attempt to parse fields as email addresses, rejecting any fields that appear to
|
||||||
|
be invalid email addresses.
|
||||||
|
|
||||||
|
#### None
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"None": {
|
||||||
|
"regex": "\\A(?:[:cntrl:]|\\s)*\\z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace matched fields with a blank value. Useful as a global rule for
|
||||||
|
normalizing blank fields in a CSV file.
|
||||||
|
|
||||||
|
#### Number
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"Number": {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Attempt to parse fields as whole integers, rejecting any fields that cannot be
|
||||||
|
parsed.
|
||||||
|
|
||||||
|
#### PhoneNumber
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"PhoneNumber": {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Attempt to parse fields as US, NANP-formatted phone numbers, transforming them
|
||||||
|
into a standard international format of `+1 <area_code> <exchange_code> <subscriber_number>`.
|
||||||
|
|
||||||
|
#### Regex
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"Regex": {
|
||||||
|
"regex": "\\A([A-Z])[A-Z]+\\z",
|
||||||
|
"template": "$1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Match fields against the provided regex pattern and transform them according to
|
||||||
|
the template string, replacing capture group placeholders. See
|
||||||
|
[Regex::replace](https://docs.rs/regex/0.2.1/regex/struct.Regex.html#method.replace)
|
||||||
|
in the regex crate docs for details.
|
||||||
|
|
||||||
|
#### RegexMatch
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"RegexMatch": {
|
||||||
|
"regex": "\\A[A-Z]{2,3}\\z",
|
||||||
|
"negate": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Reject any fields that fail to match against the provided regex pattern. If
|
||||||
|
`negate` is `true`, then reject any fields that match the provided regex pattern
|
||||||
|
instead.
|
||||||
|
|
||||||
|
#### Trim
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"Trim": {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Trim leading and trailing whitespace from fields. Useful as a global rule to
|
||||||
|
normalize fields and remove useless whitespace.
|
||||||
|
|
||||||
|
#### Zipcode
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"Zipcode": {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Attempt to parse fields as US zip codes in the formats "xxxxx" and "xxxxx-xxxx",
|
||||||
|
rejecting any fields that fail to match that format.
|
||||||
193
src/cli.rs
Normal file
193
src/cli.rs
Normal file
|
|
@ -0,0 +1,193 @@
|
||||||
|
//! Command line interface.
|
||||||
|
|
||||||
|
use std::fs::File;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use {
|
||||||
|
Ruleset,
|
||||||
|
TransformError,
|
||||||
|
TransformedRecord,
|
||||||
|
};
|
||||||
|
|
||||||
|
use csv;
|
||||||
|
|
||||||
|
/// Configuration options for the `Cli`.
|
||||||
|
pub struct Options
|
||||||
|
{
|
||||||
|
/// See `CsvOptions`.
|
||||||
|
pub csv_options: CsvOptions,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for Options {
|
||||||
|
fn default() -> Options {
|
||||||
|
Options {
|
||||||
|
csv_options: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `Cli` configuration options specific to how to parse the CSV file.
///
/// `CsvOptions` implements `Default` with the following defaults:
///
/// ```
/// extern crate csv;
/// use csv_sanity::cli::CsvOptions;
/// use csv::RecordTerminator;
///
/// let defaults = CsvOptions {
///     delimiter: b',',
///     record_terminator: csv::RecordTerminator::CRLF,
///     quote: b'"',
///     escape: None,
///     double_quote: true,
/// };
/// assert_eq!(defaults, Default::default());
/// ```
// NOTE(review): the example above compares with `assert_eq!`, which needs `PartialEq` and `Debug`
// on `CsvOptions`; no such derive is visible on this struct — confirm the doctest compiles.
pub struct CsvOptions
{
    /// Field delimiter to expect in the CSV file.
    ///
    /// Corresponds to the `csv::Reader.delimiter` method.
    pub delimiter: u8,
    /// Record terminator to expect in the CSV file.
    ///
    /// Corresponds to the `csv::Reader.record_terminator` method. See `csv::RecordTerminator`.
    pub record_terminator: csv::RecordTerminator,
    /// Field quotation character to expect in the CSV file.
    ///
    /// Corresponds to the `csv::Reader.quote` method.
    pub quote: u8,
    /// Escape character to expect in the CSV file, or `None` for no escape character.
    ///
    /// Corresponds to the `csv::Reader.escape` method.
    pub escape: Option<u8>,
    /// Whether two adjacent quote characters should be interpreted as an escaped quote character.
    ///
    /// Corresponds to the `csv::Reader.double_quote` method.
    pub double_quote: bool
}
|
||||||
|
|
||||||
|
impl Default for CsvOptions
|
||||||
|
{
|
||||||
|
fn default() -> CsvOptions {
|
||||||
|
CsvOptions {
|
||||||
|
delimiter: b',',
|
||||||
|
record_terminator: csv::RecordTerminator::CRLF,
|
||||||
|
quote: b'"',
|
||||||
|
escape: None,
|
||||||
|
double_quote: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Command line interface for running a `Ruleset` against a CSV file.
///
/// Construct one with `Cli::new` or `Cli::new_with_options`, then call `run`.
pub struct Cli
{
    /// Settings used when opening and parsing the input file.
    options: Options,
    /// Rules applied to every record read from the input file.
    ruleset: Ruleset,
}
|
||||||
|
|
||||||
|
impl Cli
|
||||||
|
{
|
||||||
|
/// Construct a new `Cli` with default options.
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use csv_sanity::Ruleset;
|
||||||
|
/// use csv_sanity::cli::{
|
||||||
|
/// Cli
|
||||||
|
/// };
|
||||||
|
///
|
||||||
|
/// let ruleset = Ruleset::new();
|
||||||
|
/// let cli = Cli::new(ruleset);
|
||||||
|
/// ```
|
||||||
|
pub fn new(ruleset: Ruleset) -> Cli {
|
||||||
|
Self::new_with_options(ruleset, Default::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Construct a new `Cli` with the specified options.
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use csv_sanity::Ruleset;
|
||||||
|
/// use csv_sanity::cli::{
|
||||||
|
/// Cli,
|
||||||
|
/// Options,
|
||||||
|
/// CsvOptions
|
||||||
|
/// };
|
||||||
|
///
|
||||||
|
/// let ruleset = Ruleset::new();
|
||||||
|
/// let cli = Cli::new_with_options(ruleset, Options {
|
||||||
|
/// csv_options: CsvOptions {
|
||||||
|
/// delimiter: b',',
|
||||||
|
/// .. Default::default()
|
||||||
|
/// },
|
||||||
|
/// .. Default::default()
|
||||||
|
/// });
|
||||||
|
/// ```
|
||||||
|
pub fn new_with_options(ruleset: Ruleset, options: Options) -> Cli {
|
||||||
|
Cli {
|
||||||
|
options: options,
|
||||||
|
ruleset: ruleset,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply the ruleset to every record of the CSV file at `input_file_path`.
///
/// Transformed records are written to `output_file_name`, each prefixed with a
/// "Record Number" column. Every error is written to `error_file_name` with the columns
/// "Record Number", "Field Name", "Field Value" and "Reason".
///
/// # Panics
///
/// Panics if the input file or its headers cannot be read, or if either output file cannot be
/// opened or written to.
pub fn run<I: AsRef<Path>, O: AsRef<Path>, E: AsRef<Path>>(&self, input_file_path: I, output_file_name: O, error_file_name: E) {
    let (mut reader, headers) = self.reader_from_file(input_file_path);

    // The output file repeats the input headers, with the record number as an extra first column.
    let mut output_writer = csv::Writer::from_file(output_file_name).expect("Unable to open output file for writing");
    let mut output_headers = headers.clone();
    output_headers.insert(0, "Record Number".to_string());
    output_writer.encode(output_headers).expect("Unable to write to output file");

    let mut error_writer = csv::Writer::from_file(error_file_name).expect("Unable to open error file for writing");
    let error_headers = vec![
        "Record Number",
        "Field Name",
        "Field Value",
        "Reason",
    ];
    error_writer.encode(error_headers).expect("Unable to write to error file");

    for (record_n, record) in reader.records().enumerate() {
        let original_line_n = record_n + 2; // Plus one for headers and plus one for zero-indexing.
        let transformed_record: TransformedRecord = match record {
            // A record the CSV reader could not parse is reported with empty field name/value
            // and produces no row in the output file.
            Err(e) => {
                let err = TransformError {
                    field_value: "".to_string(),
                    field_name: "".to_string(),
                    record_n: original_line_n,
                    reason: format!("{}", e),
                };
                error_writer.encode(err).expect("Unable to write to error file");
                continue;
            },
            Ok(ref rec) => self.ruleset.apply_rules(&headers, rec, original_line_n)
        };
        // Prepend the record number to the transformed field values.
        let record_fields: Vec<Option<String>> = {
            let mut fs = vec![Some(original_line_n.to_string())];
            fs.extend(transformed_record.field_values);
            fs
        };
        output_writer.encode(record_fields).expect("Unable to write to output file");
        // A record is still written to the output even when some of its fields had errors.
        for error in transformed_record.errors {
            error_writer.encode(error).expect("Unable to write to error file");
        }
    }
}
|
||||||
|
|
||||||
|
/// Open the CSV file at `path`, configure the reader from `self.options.csv_options`, and
/// read the header row.
///
/// Returns the configured reader together with the headers.
///
/// # Panics
///
/// Panics if the file cannot be opened or its headers cannot be read.
fn reader_from_file<P: AsRef<Path>>(&self, path: P) -> (csv::Reader<File>, Vec<String>) {
    let input_path: &Path = path.as_ref();
    let csv_options = &self.options.csv_options;

    let unconfigured = csv::Reader::from_file(input_path)
        .expect(&format!("Unable to read file {}", input_path.display()));

    // Configure the reader according to the options passed to the Cli constructor.
    let mut reader = unconfigured
        .has_headers(true)
        .delimiter(csv_options.delimiter)
        .record_terminator(csv_options.record_terminator)
        .quote(csv_options.quote)
        .escape(csv_options.escape)
        .double_quote(csv_options.double_quote)
        .flexible(true);

    let headers = reader.headers()
        .expect(&format!("Unable to read headers from input file {}", input_path.display()));
    (reader, headers)
}
|
||||||
|
}
|
||||||
36
src/lib.rs
Normal file
36
src/lib.rs
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
#[macro_use]
|
||||||
|
extern crate serde_derive;
|
||||||
|
extern crate serde;
|
||||||
|
extern crate serde_json;
|
||||||
|
extern crate regex;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate lazy_static;
|
||||||
|
extern crate unicode_segmentation;
|
||||||
|
extern crate time;
|
||||||
|
extern crate csv;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate custom_derive;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate newtype_derive;
|
||||||
|
extern crate rustc_serialize;
|
||||||
|
|
||||||
|
mod newtypes;
|
||||||
|
|
||||||
|
pub mod transformer;
|
||||||
|
pub use transformer::{
|
||||||
|
Transformer,
|
||||||
|
TransformResult,
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformError
|
||||||
|
};
|
||||||
|
|
||||||
|
pub mod transformers;
|
||||||
|
|
||||||
|
mod ruleset;
|
||||||
|
pub use ruleset::{
|
||||||
|
Rule,
|
||||||
|
Ruleset,
|
||||||
|
TransformedRecord,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub mod cli;
|
||||||
111
src/main.rs
Normal file
111
src/main.rs
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
extern crate csv_sanity;
|
||||||
|
|
||||||
|
extern crate serde_json;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate log;
|
||||||
|
extern crate regex;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate clap;
|
||||||
|
|
||||||
|
use csv_sanity::cli::{
|
||||||
|
self,
|
||||||
|
Cli,
|
||||||
|
};
|
||||||
|
|
||||||
|
use std::fs::File;
|
||||||
|
use std::path::Path;
|
||||||
|
use log::{
|
||||||
|
LogRecord,
|
||||||
|
LogLevel,
|
||||||
|
LogMetadata,
|
||||||
|
LogLevelFilter,
|
||||||
|
SetLoggerError
|
||||||
|
};
|
||||||
|
use clap::{
|
||||||
|
App,
|
||||||
|
Arg
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Minimal logger that prints log records to standard output.
struct ConsoleLogger {
    /// Threshold level: a record is printed when its level compares `<=` to this value.
    log_level: LogLevel
}
|
||||||
|
|
||||||
|
impl log::Log for ConsoleLogger {
|
||||||
|
fn enabled(&self, metadata: &LogMetadata) -> bool {
|
||||||
|
metadata.level() <= self.log_level
|
||||||
|
}
|
||||||
|
|
||||||
|
fn log(&self, record: &LogRecord) {
|
||||||
|
if self.enabled(record.metadata()) {
|
||||||
|
println!("{} - {}", record.level(), record.args())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn init_logging() -> Result<(), SetLoggerError> {
|
||||||
|
log::set_logger(|max_log_level| {
|
||||||
|
max_log_level.set(LogLevelFilter::Info);
|
||||||
|
Box::new(ConsoleLogger { log_level: LogLevel::Info })
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
init_logging().unwrap();
|
||||||
|
|
||||||
|
let matches = App::new("Convert CSV")
|
||||||
|
.version(crate_version!())
|
||||||
|
.author("M. George Hansen <technopolitica@gmail.com>")
|
||||||
|
.about("Apply a set of transformations to the records in a CSV file, attempting to read a much valid information from the file as possible.")
|
||||||
|
.arg(Arg::with_name("INPUT_FILE")
|
||||||
|
.help("CSV file to process")
|
||||||
|
.required(true)
|
||||||
|
.index(1))
|
||||||
|
.arg(Arg::with_name("output")
|
||||||
|
.help("File to output the transformed CSV records. Defaults to ./output.csv")
|
||||||
|
.short("o")
|
||||||
|
.long("output")
|
||||||
|
.takes_value(true))
|
||||||
|
.arg(Arg::with_name("error_output")
|
||||||
|
.help("File to output errors in CSV format. Defaults to ./errors.csv")
|
||||||
|
.short("e")
|
||||||
|
.long("error_output")
|
||||||
|
.takes_value(true))
|
||||||
|
.arg(Arg::with_name("ruleset")
|
||||||
|
.help("JSON file containing the ruleset to apply. Defaults to ./ruleset.json")
|
||||||
|
.short("r")
|
||||||
|
.long("ruleset")
|
||||||
|
.takes_value(true))
|
||||||
|
.get_matches();
|
||||||
|
|
||||||
|
let ruleset_file_path = Path::new(matches.value_of("ruleset").unwrap_or("ruleset.json"));
|
||||||
|
let ruleset_file = match File::open(ruleset_file_path) {
|
||||||
|
Ok(f) => f,
|
||||||
|
Err(e) => exit_with_error(&format!("unable to read ruleset file {}: {}", ruleset_file_path.display(), e))
|
||||||
|
};
|
||||||
|
let ruleset = match serde_json::from_reader(ruleset_file) {
|
||||||
|
Ok(r) => r,
|
||||||
|
Err(e) => {
|
||||||
|
exit_with_error(&format!("failed to parse ruleset from {}: {}", ruleset_file_path.display(), e));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let cli_app = Cli::new_with_options(ruleset, cli::Options {
|
||||||
|
csv_options: cli::CsvOptions {
|
||||||
|
delimiter: b'\t',
|
||||||
|
.. Default::default()
|
||||||
|
},
|
||||||
|
.. Default::default()
|
||||||
|
});
|
||||||
|
|
||||||
|
// NOTE: Required arguments are validated by clap, so we should be safe to use expect here.
|
||||||
|
let input_file_name = matches.value_of("INPUT_FILE").expect("INPUT_FILE argument could not be found!");
|
||||||
|
let output_file_name = matches.value_of("output_file").unwrap_or("output.csv");
|
||||||
|
let error_file_name = matches.value_of("error_file").unwrap_or("errors.csv");
|
||||||
|
cli_app.run(input_file_name, output_file_name, error_file_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn exit_with_error(error_msg: &str) -> !
|
||||||
|
{
|
||||||
|
error!("{}", error_msg);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
59
src/newtypes.rs
Normal file
59
src/newtypes.rs
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
use std::hash::{
|
||||||
|
Hash,
|
||||||
|
Hasher,
|
||||||
|
};
|
||||||
|
use regex;
|
||||||
|
use serde::{
|
||||||
|
Serialize,
|
||||||
|
Serializer,
|
||||||
|
Deserialize,
|
||||||
|
Deserializer,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Newtype around `regex::Regex`. The wrapper exists so that this module can implement
// `PartialEq`, `Eq`, `Hash`, `Serialize` and `Deserialize` for it (see the impls below), while
// the derived newtype traits forward conversion, deref, display and debug to the inner regex.
custom_derive! {
    #[derive(NewtypeFrom, NewtypeDeref, NewtypeDerefMut, Clone, NewtypeDisplay, NewtypeDebug)]
    pub struct Regex(regex::Regex);
}
|
||||||
|
|
||||||
|
impl PartialEq for Regex {
|
||||||
|
fn eq(&self, other: &Regex) -> bool
|
||||||
|
{
|
||||||
|
self.0.as_str() == other.0.as_str()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Equality compares pattern strings (see `PartialEq` above), which is a total equivalence.
impl Eq for Regex {}
|
||||||
|
|
||||||
|
impl Hash for Regex {
|
||||||
|
fn hash<H>(&self, state: &mut H)
|
||||||
|
where H: Hasher {
|
||||||
|
self.as_str().hash(state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Serialize for Regex
|
||||||
|
{
|
||||||
|
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||||
|
where S: Serializer {
|
||||||
|
let Regex(ref regex) = *self;
|
||||||
|
regex.as_str().serialize(serializer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'de> Deserialize<'de> for Regex
|
||||||
|
{
|
||||||
|
fn deserialize<D>(deserializer: D) -> Result<Regex, D::Error>
|
||||||
|
where D: Deserializer<'de>
|
||||||
|
{
|
||||||
|
use serde::de::{Unexpected, Error};
|
||||||
|
let string: Result<String, D::Error> = Deserialize::deserialize(deserializer);
|
||||||
|
string.and_then(|s| {
|
||||||
|
regex::Regex::new(&s)
|
||||||
|
.map(|r| Regex(r))
|
||||||
|
.map_err(|e| {
|
||||||
|
let message: &str = &format!("invalid regex string: {}", e);
|
||||||
|
D::Error::invalid_value(Unexpected::Str(&s), &message)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
345
src/ruleset.rs
Normal file
345
src/ruleset.rs
Normal file
|
|
@ -0,0 +1,345 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResult,
|
||||||
|
TransformError,
|
||||||
|
};
|
||||||
|
use transformers::{
|
||||||
|
Transformers,
|
||||||
|
TrimTransformer,
|
||||||
|
NoneTransformer,
|
||||||
|
};
|
||||||
|
|
||||||
|
use std::hash::{
|
||||||
|
Hash,
|
||||||
|
Hasher,
|
||||||
|
};
|
||||||
|
use std::iter::FromIterator;
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
use std::collections::{
|
||||||
|
BinaryHeap,
|
||||||
|
HashSet,
|
||||||
|
};
|
||||||
|
use std::error;
|
||||||
|
use std::fmt::{
|
||||||
|
self,
|
||||||
|
Formatter,
|
||||||
|
Display,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Applicability of a `Rule`, determining which of a CSV record's fields it can be applied to.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub enum Applicability {
    /// Applicable to all CSV record fields.
    Global,
    /// Applicable to a subset of a CSV record's fields, specified by field name.
    Fields {
        /// Names (as they appear in the CSV headers) of the fields the rule applies to.
        field_names: HashSet<String>
    }
}
|
||||||
|
|
||||||
|
impl Hash for Applicability {
|
||||||
|
fn hash<H>(&self, state: &mut H)
|
||||||
|
where H: Hasher {
|
||||||
|
use self::Applicability::*;
|
||||||
|
match *self {
|
||||||
|
Global => (self as *const Applicability).hash(state), // FIXME: Is this the correct way to hash an empty enum variant?
|
||||||
|
Fields { ref field_names } => field_names.iter().collect::<Vec<&String>>().hash(state)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return `true` when `priority` equals the default priority of 0.
///
/// Used as the `skip_serializing_if` predicate for `Rule::priority`.
fn priority_is_default(priority: &isize) -> bool {
    let default_priority = isize::default();
    *priority == default_priority
}
|
||||||
|
|
||||||
|
/// A `Transformer` paired with `Applicability` and a priority which can be applied to fields in a
/// CSV record.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct Rule
{
    /// Which fields of a record this rule is applied to.
    applicability: Applicability,
    /// The transformation run on every applicable field.
    transformer: Transformers,
    /// Ordering key of the rule; defaults to 0 when absent from the serialized form and is
    /// left out of it when it is 0.
    #[serde(default, skip_serializing_if="priority_is_default")]
    priority: isize
}
|
||||||
|
|
||||||
|
impl Rule
|
||||||
|
{
|
||||||
|
/// Construct a new `Rule` whoe `Transformer` is applicable to one or more CSV record's fields
|
||||||
|
/// referenced by name with the default priority of 0.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
/// ```
|
||||||
|
/// use csv_sanity::Rule;
|
||||||
|
/// use csv_sanity::transformers::*;
|
||||||
|
///
|
||||||
|
/// let rule = Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
|
||||||
|
/// CapitalizeTransformer::new()
|
||||||
|
/// ));
|
||||||
|
/// ```
|
||||||
|
pub fn for_fields(field_names: &[&str], transformer: Transformers) -> Rule {
|
||||||
|
Self::for_fields_with_priority(field_names, transformer, Default::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Construct a new `Rule` whoe `Transformer` is applicable to one or more CSV record's fields
|
||||||
|
/// referenced by name with the specified priority.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
/// ```
|
||||||
|
/// use csv_sanity::Rule;
|
||||||
|
/// use csv_sanity::transformers::*;
|
||||||
|
///
|
||||||
|
/// let rule = Rule::for_fields_with_priority(&["Fist Name", "Last Name"], Transformers::Capitalize(
|
||||||
|
/// CapitalizeTransformer::new()
|
||||||
|
/// ), 10);
|
||||||
|
/// ```
|
||||||
|
pub fn for_fields_with_priority(field_names: &[&str], transformer: Transformers, priority: isize) -> Rule {
|
||||||
|
Rule {
|
||||||
|
applicability: Applicability::Fields { field_names: field_names.iter().map(|s| s.to_string()).collect() },
|
||||||
|
transformer: transformer,
|
||||||
|
priority: priority
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Construct a new `Rule` applicable to all of a CSV record's fields with the default priority
|
||||||
|
/// of 0.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
/// ```
|
||||||
|
/// use csv_sanity::Rule;
|
||||||
|
/// use csv_sanity::transformers::*;
|
||||||
|
///
|
||||||
|
/// let rule = Rule::global(Transformers::Capitalize(
|
||||||
|
/// CapitalizeTransformer::new()
|
||||||
|
/// ));
|
||||||
|
/// ```
|
||||||
|
pub fn global(transformer: Transformers) -> Rule {
|
||||||
|
Self::global_with_priority(transformer, Default::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Construct a new `Rule` applicable to all of a CSV record's fields with the specified
|
||||||
|
/// priority.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
/// ```
|
||||||
|
/// use csv_sanity::Rule;
|
||||||
|
/// use csv_sanity::transformers::*;
|
||||||
|
///
|
||||||
|
/// let rule = Rule::global_with_priority(Transformers::Capitalize(
|
||||||
|
/// CapitalizeTransformer::new()
|
||||||
|
/// ), 10);
|
||||||
|
/// ```
|
||||||
|
pub fn global_with_priority(transformer: Transformers, priority: isize) -> Rule {
|
||||||
|
Rule {
|
||||||
|
applicability: Applicability::Global,
|
||||||
|
transformer: transformer,
|
||||||
|
priority: priority
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply this rule to a CSV record's field, returning the resulting `TransformResult`.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
/// ```
|
||||||
|
/// use csv_sanity::Rule;
|
||||||
|
/// use csv_sanity::transformers::*;
|
||||||
|
///
|
||||||
|
/// let field = "JOHN";
|
||||||
|
/// let field_name = "First Name";
|
||||||
|
///
|
||||||
|
/// let rule = Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
|
||||||
|
/// CapitalizeTransformer::new()
|
||||||
|
/// ));
|
||||||
|
/// rule.apply(field, field_name, 1);
|
||||||
|
/// ```
|
||||||
|
pub fn apply(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
// XXX: Does the applicability check belong inside the apply method? Or should the caller
|
||||||
|
// decide?
|
||||||
|
match self.applicability {
|
||||||
|
Applicability::Global => self.transformer.transform(field_value, field_name, record_n),
|
||||||
|
Applicability::Fields { ref field_names } if field_names.contains(&field_name.to_string()) => {
|
||||||
|
self.transformer.transform(field_value, field_name, record_n)
|
||||||
|
},
|
||||||
|
_ => Ok(Some(field_value.to_string()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Ord for Rule
|
||||||
|
{
|
||||||
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
|
other.priority.cmp(&self.priority)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialOrd for Rule
|
||||||
|
{
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
|
Some(self.cmp(other))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An ordered set of `Rule`s sorted by priority.
///
/// # Examples
/// ```
/// use csv_sanity::{
///     Ruleset,
///     Rule,
///     TransformedRecord,
/// };
/// use csv_sanity::transformers::*;
/// let ruleset = {
///     let mut r = Ruleset::new();
///     r.add_rule(Rule::for_fields(&["First Name", "Last Name"], Transformers::Capitalize(
///         CapitalizeTransformer::new()
///     )));
///     r.add_rule(Rule::for_fields(&["Email"], Transformers::Email(
///         EmailTransformer::new()
///     )));
///     r
/// };
/// let headers = vec!["Id", "First Name", "Last Name", "Email"].iter().map(|s| s.to_string()).collect();
/// let record = vec!["1", " JOHN", "SNOW ", "\t JSNOW@EXAMPLE.COM "].iter().map(|s| s.to_string()).collect();
/// let transformed_record = ruleset.apply_rules(&headers, &record, 1);
/// assert_eq!(TransformedRecord {
///     field_values: vec!["1", "John", "Snow", "jsnow@example.com"].iter().map(|s| Some(s.to_string())).collect(),
///     errors: Vec::new(),
/// }, transformed_record);
/// ```
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Ruleset {
    /// Heap of rules, ordered by `Rule`'s `Ord` implementation.
    rules: BinaryHeap<Rule>
}
|
||||||
|
|
||||||
|
impl Ruleset {
|
||||||
|
/// Construct a new `Ruleset` with a default `NoneTransformer` and `TrimTransformer` global
|
||||||
|
/// rules.
|
||||||
|
///
|
||||||
|
/// The default trim and none rules should be appropriate for most CSV files. For CSV files
|
||||||
|
/// where these default rules are not desired use the `Ruleset::without_default_rules` method.
|
||||||
|
pub fn new() -> Ruleset {
|
||||||
|
let mut ruleset = Self::without_default_rules();
|
||||||
|
// Add a default trim rule and blank rule to match empty fields.
|
||||||
|
ruleset.add_rule(Rule::global_with_priority(Transformers::None(NoneTransformer::with_blank_matcher()), -10));
|
||||||
|
ruleset.add_rule(Rule::global_with_priority(Transformers::Trim(TrimTransformer::new()), -10));
|
||||||
|
ruleset
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Construct a new `Ruleset` without any of the default rules.
|
||||||
|
pub fn without_default_rules() -> Ruleset {
|
||||||
|
Ruleset {
|
||||||
|
rules: BinaryHeap::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a `Rule` to this ruleset.
///
/// The rule is pushed onto the internal heap; rules already present are kept.
pub fn add_rule(&mut self, rule: Rule) {
    self.rules.push(rule);
}
|
||||||
|
|
||||||
|
/// Validate this ruleset against a CSV file by comparing it's `Rule`s against the headers.
|
||||||
|
pub fn validate_rules(&self, headers: &Vec<String>) -> Result<(), Vec<ValidationError>> {
|
||||||
|
let mut errors = Vec::new();
|
||||||
|
for rule in self.rules.iter() {
|
||||||
|
if let Applicability::Fields { ref field_names } = rule.applicability {
|
||||||
|
let header_set = HashSet::<String>::from_iter(headers.clone());
|
||||||
|
let field_set = HashSet::<String>::from_iter(field_names.clone());
|
||||||
|
let diff: HashSet<String> = field_set.difference(&header_set).cloned().collect();
|
||||||
|
if diff.len() > 0 {
|
||||||
|
// FIXME: We should have a better way to construct a ruleset that uses Result
|
||||||
|
// instead of panic! here.
|
||||||
|
errors.push(
|
||||||
|
ValidationError {
|
||||||
|
reason: format!("The following fields were not found in headers: '{:?}'", diff),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if errors.is_empty() {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(errors)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply this `Ruleset` to a record from a CSV file.
|
||||||
|
pub fn apply_rules(&self, headers: &Vec<String>, fields: &Vec<String>, record_n: usize) -> TransformedRecord {
|
||||||
|
let expected_n_fields = headers.len();
|
||||||
|
|
||||||
|
let mut errors: Vec<TransformError> = Vec::new();
|
||||||
|
let mut transformed_fields: Vec<Option<String>> = Vec::new();
|
||||||
|
for (field_n, field_value) in fields.iter().enumerate() {
|
||||||
|
if field_n < expected_n_fields {
|
||||||
|
let field_name = &headers[field_n];
|
||||||
|
let mut transformed_field_value = Some(field_value.clone());
|
||||||
|
// Try each rule in order of priority and test to see if it is applicable.
|
||||||
|
for rule in self.rules.iter() {
|
||||||
|
let new_value = match transformed_field_value {
|
||||||
|
Some(ref fv) => {
|
||||||
|
let transform_result = rule.apply(fv, &field_name, record_n);
|
||||||
|
match transform_result {
|
||||||
|
Ok(tfv) => tfv,
|
||||||
|
Err(e) => {
|
||||||
|
errors.push(e);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
// The last transformer returned None, so we can short circuit and just
|
||||||
|
// return None for the field value.
|
||||||
|
None => break
|
||||||
|
};
|
||||||
|
transformed_field_value = new_value;
|
||||||
|
}
|
||||||
|
transformed_fields.insert(field_n, transformed_field_value);
|
||||||
|
} else {
|
||||||
|
errors.push(
|
||||||
|
TransformError {
|
||||||
|
field_value: field_value.to_string(),
|
||||||
|
field_name: field_n.to_string(),
|
||||||
|
record_n: record_n,
|
||||||
|
reason: format!("found {} header fields but record had extra field at position {}", expected_n_fields, field_n)
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TransformedRecord {
|
||||||
|
field_values: transformed_fields,
|
||||||
|
errors: errors,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Error for when a `Ruleset` does not validate against a CSV file.
#[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Hash, Debug)]
pub struct ValidationError {
    /// Human-readable explanation of why validation failed; also used as the `Display` output.
    reason: String,
}
|
||||||
|
|
||||||
|
impl Display for ValidationError
|
||||||
|
{
|
||||||
|
fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
|
||||||
|
write!(formatter, "{}", self.reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl error::Error for ValidationError
|
||||||
|
{
|
||||||
|
fn description(&self) -> &str {
|
||||||
|
&self.reason
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A single processed and transformed record.
#[derive(Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
pub struct TransformedRecord {
    /// Transformed fields for the record.
    ///
    /// Empty fields are explicitly encoded as `None` values.
    pub field_values: Vec<Option<String>>,
    /// Errors that were encountered during transformation, if any.
    pub errors: Vec<TransformError>,
}
|
||||||
91
src/transformer.rs
Normal file
91
src/transformer.rs
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
//! Traits and types that define transformations on CSV record fields.
|
||||||
|
|
||||||
|
use std::result;
|
||||||
|
use std::error;
|
||||||
|
use std::fmt::{
|
||||||
|
self,
|
||||||
|
Formatter,
|
||||||
|
Display,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// `Result` for the transformation of a CSV record's field: `Ok(Some(value))` for a
/// successfully transformed non-empty value, `Ok(None)` for a successfully transformed empty
/// value, or a `TransformError` if the transformation was unsuccessful.
pub type TransformResult = result::Result<Option<String>, TransformError>;
|
||||||
|
|
||||||
|
/// Helper trait with a few useful utility methods for constructing `TransformResult`.
|
||||||
|
pub trait TransformResultHelper
|
||||||
|
{
|
||||||
|
/// Construct a `TransformResult` that represents a successful transformation of a CSV record's
|
||||||
|
/// field with a non-empty value.
|
||||||
|
fn present(value: &str) -> TransformResult {
|
||||||
|
Ok(Some(value.to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Construct a `TransformResult` that represents a successful tranformation of a CSV record's
|
||||||
|
/// field with an empty value.
|
||||||
|
fn excluded() -> TransformResult {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Construct a `TransformResult` that represents a failed transformation of a CSV record's
|
||||||
|
/// field with a descritive error reason.
|
||||||
|
///
|
||||||
|
/// An error reason should be a short, single sentence without punctuation or capitization,
|
||||||
|
/// e.g. "not a valid email address" instead of "The email address was invalid.".
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use csv_sanity::transformer::{
|
||||||
|
/// TransformResult,
|
||||||
|
/// TransformError,
|
||||||
|
/// TransformResultHelper,
|
||||||
|
/// };
|
||||||
|
///
|
||||||
|
/// let result = TransformResult::error("jak,.@hot mail.com", "Email", 0, "not a valid email address");
|
||||||
|
/// assert_eq!(result, Err(TransformError {
|
||||||
|
/// field_value: "jak,.@hot mail.com".to_string(),
|
||||||
|
/// field_name: "Email".to_string(),
|
||||||
|
/// record_n: 0,
|
||||||
|
/// reason: "not a valid email address".to_string(),
|
||||||
|
/// }));
|
||||||
|
/// ```
|
||||||
|
fn error(field_value: &str, field_name: &str, record_n: usize, reason: &str) -> TransformResult {
|
||||||
|
Err(
|
||||||
|
TransformError {
|
||||||
|
field_value: field_value.to_string(),
|
||||||
|
field_name: field_name.to_string(),
|
||||||
|
record_n: record_n,
|
||||||
|
reason: reason.to_string(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Makes the helper constructors callable as `TransformResult::present(..)`,
// `TransformResult::excluded()` and `TransformResult::error(..)`.
impl TransformResultHelper for TransformResult {}
|
||||||
|
|
||||||
|
/// A transformation that can be applied to the value of a single CSV record field.
pub trait Transformer
{
    /// Transform `field_value`.
    ///
    /// `field_name` and `record_n` identify the field being transformed; they are what
    /// `TransformResult::error` expects when a failure is reported.
    ///
    /// Returns a `TransformResult`: the transformed value, `None` for an empty value, or a
    /// `TransformError`.
    fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult;
}
|
||||||
|
|
||||||
|
/// Description of a failed transformation of a single CSV record field.
// NOTE(review): the field order appears to line up with the error file headers written by the
// CLI ("Record Number", "Field Name", "Field Value", "Reason") — confirm before reordering.
#[derive(RustcEncodable, Deserialize, Serialize, Clone, PartialEq, Eq, Hash, Debug)]
pub struct TransformError
{
    /// Number of the record the field belongs to, as supplied by the caller.
    pub record_n: usize,
    /// Name of the field that failed to transform.
    pub field_name: String,
    /// Value of the field at the time the transformation failed.
    pub field_value: String,
    /// Short explanation of the failure; included in the `Display` output.
    pub reason: String,
}
|
||||||
|
|
||||||
|
impl Display for TransformError
|
||||||
|
{
|
||||||
|
fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
|
||||||
|
write!(formatter, "failed to transform field: {}", self.reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl error::Error for TransformError
|
||||||
|
{
|
||||||
|
fn description(&self) -> &str {
|
||||||
|
&self.reason
|
||||||
|
}
|
||||||
|
}
|
||||||
41
src/transformers/capitalize.rs
Normal file
41
src/transformers/capitalize.rs
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
|
pub fn capitalize(string: &str) -> String
|
||||||
|
{
|
||||||
|
string.unicode_words()
|
||||||
|
.map(capitalize_word).collect::<Vec<String>>()
|
||||||
|
.join(" ")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Upper-case the first character of `word` and lower-case all remaining characters.
///
/// Case mapping is done per `char`, so one character may expand to several (for example `ß`
/// upper-cases to `SS`). An empty word yields an empty string.
fn capitalize_word(word: &str) -> String
{
    let mut letters = word.chars();
    let mut capitalized = String::with_capacity(word.len());
    if let Some(initial) = letters.next() {
        capitalized.extend(initial.to_uppercase());
    }
    for letter in letters {
        capitalized.extend(letter.to_lowercase());
    }
    capitalized
}
|
||||||
|
|
||||||
|
/// `Transformer` that capitalizes every word of a field value (see `capitalize`).
///
/// The transformer has no configuration.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct CapitalizeTransformer {}
|
||||||
|
|
||||||
|
impl CapitalizeTransformer
{
    /// Construct a new `CapitalizeTransformer`.
    pub fn new() -> CapitalizeTransformer
    {
        CapitalizeTransformer {}
    }
}
|
||||||
|
|
||||||
|
impl Transformer for CapitalizeTransformer
|
||||||
|
{
|
||||||
|
fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult
|
||||||
|
{
|
||||||
|
let result = capitalize(field_value);
|
||||||
|
TransformResult::present(&result)
|
||||||
|
}
|
||||||
|
}
|
||||||
37
src/transformers/choice.rs
Normal file
37
src/transformers/choice.rs
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
|
||||||
|
/// `Transformer` that accepts a field value only if it is one of a fixed list of choices.
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
pub struct ChoiceTransformer {
    /// The accepted values; a field value must match one of them exactly.
    choices: Vec<String>,
}
|
||||||
|
|
||||||
|
impl ChoiceTransformer
{
    /// Construct a new `ChoiceTransformer` that accepts exactly the given `choices`.
    pub fn new(choices: Vec<String>) -> ChoiceTransformer
    {
        ChoiceTransformer {
            choices: choices,
        }
    }
}
|
||||||
|
|
||||||
|
impl Transformer for ChoiceTransformer
|
||||||
|
{
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult
|
||||||
|
{
|
||||||
|
if self.choices.contains(&field_value.to_string()) {
|
||||||
|
TransformResult::present(&field_value)
|
||||||
|
} else {
|
||||||
|
TransformResult::error(
|
||||||
|
field_value,
|
||||||
|
field_name,
|
||||||
|
record_n,
|
||||||
|
&format!("not in valid choices {:?}", self.choices)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
41
src/transformers/date.rs
Normal file
41
src/transformers/date.rs
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
|
||||||
|
use time::{
|
||||||
|
strptime
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct DateTransformer {
|
||||||
|
input_formats: Vec<String>,
|
||||||
|
output_format: String
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DateTransformer {
|
||||||
|
pub fn new(input_formats: Vec<String>, output_format: &str) -> DateTransformer {
|
||||||
|
DateTransformer {
|
||||||
|
input_formats: input_formats,
|
||||||
|
output_format: output_format.to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_iso8601_output(input_formats: Vec<String>) -> DateTransformer {
|
||||||
|
Self::new(input_formats, "%F")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for DateTransformer {
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
for format in self.input_formats.iter() {
|
||||||
|
if let Ok(time) = strptime(field_value, &format) {
|
||||||
|
return TransformResult::present(
|
||||||
|
&format!("{}", time.strftime(&self.output_format).unwrap())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
TransformResult::error(field_value, field_name, record_n, "unable to parse as date")
|
||||||
|
}
|
||||||
|
}
|
||||||
30
src/transformers/email.rs
Normal file
30
src/transformers/email.rs
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
lazy_static! {
    // Case-insensitive, fully anchored email pattern: a local part made of dot-separated atoms,
    // an '@', then at least two dot-separated domain labels that neither start nor end with '-'.
    static ref EMAIL_REGEX: Regex = {
        let pattern = r"(?i)\A[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\z";
        Regex::new(pattern).unwrap()
    };
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct EmailTransformer {}
|
||||||
|
|
||||||
|
impl EmailTransformer {
|
||||||
|
pub fn new() -> EmailTransformer {
|
||||||
|
EmailTransformer {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for EmailTransformer {
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
if EMAIL_REGEX.is_match(field_value) {
|
||||||
|
TransformResult::present(&field_value.to_lowercase())
|
||||||
|
} else {
|
||||||
|
TransformResult::error(field_value, field_name, record_n, "invalid email address")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
76
src/transformers/mod.rs
Normal file
76
src/transformers/mod.rs
Normal file
|
|
@ -0,0 +1,76 @@
|
||||||
|
use transformer::{
|
||||||
|
Transformer,
|
||||||
|
TransformResult,
|
||||||
|
};
|
||||||
|
|
||||||
|
mod trim;
|
||||||
|
pub use self::trim::TrimTransformer;
|
||||||
|
|
||||||
|
mod none;
|
||||||
|
pub use self::none::NoneTransformer;
|
||||||
|
|
||||||
|
mod regex;
|
||||||
|
pub use self::regex::{
|
||||||
|
RegexTransformer,
|
||||||
|
RegexMatchTransformer
|
||||||
|
};
|
||||||
|
|
||||||
|
mod capitalize;
|
||||||
|
pub use self::capitalize::{
|
||||||
|
CapitalizeTransformer,
|
||||||
|
capitalize
|
||||||
|
};
|
||||||
|
|
||||||
|
mod email;
|
||||||
|
pub use self::email::EmailTransformer;
|
||||||
|
|
||||||
|
mod number;
|
||||||
|
pub use self::number::NumberTransformer;
|
||||||
|
|
||||||
|
mod date;
|
||||||
|
pub use self::date::DateTransformer;
|
||||||
|
|
||||||
|
mod choice;
|
||||||
|
pub use self::choice::ChoiceTransformer;
|
||||||
|
|
||||||
|
mod zipcode;
|
||||||
|
pub use self::zipcode::ZipcodeTransformer;
|
||||||
|
|
||||||
|
mod phone_number;
|
||||||
|
pub use self::phone_number::PhoneNumberTransformer;
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub enum Transformers
|
||||||
|
{
|
||||||
|
Trim(TrimTransformer),
|
||||||
|
None(NoneTransformer),
|
||||||
|
Regex(RegexTransformer),
|
||||||
|
RegexMatch(RegexMatchTransformer),
|
||||||
|
Capitalize(CapitalizeTransformer),
|
||||||
|
Email(EmailTransformer),
|
||||||
|
Number(NumberTransformer),
|
||||||
|
Date(DateTransformer),
|
||||||
|
Choice(ChoiceTransformer),
|
||||||
|
Zipcode(ZipcodeTransformer),
|
||||||
|
PhoneNumber(PhoneNumberTransformer),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for Transformers {
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
use self::Transformers::*;
|
||||||
|
|
||||||
|
match *self {
|
||||||
|
Trim(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
None(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
Regex(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
RegexMatch(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
Capitalize(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
Email(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
Number(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
Date(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
Choice(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
Zipcode(ref t) => t.transform(field_value, field_name, record_n),
|
||||||
|
PhoneNumber(ref t) => t.transform(field_value, field_name, record_n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
34
src/transformers/none.rs
Normal file
34
src/transformers/none.rs
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
use newtypes::Regex;
|
||||||
|
|
||||||
|
use regex;
|
||||||
|
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct NoneTransformer {
|
||||||
|
regex: Regex
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NoneTransformer {
|
||||||
|
pub fn new(regex: regex::Regex) -> NoneTransformer {
|
||||||
|
NoneTransformer { regex: Regex::from(regex) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_blank_matcher() -> NoneTransformer {
|
||||||
|
Self::new(regex::Regex::new(r"\A(?:[:cntrl:]|\s)*\z").unwrap())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for NoneTransformer {
|
||||||
|
fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult {
|
||||||
|
if self.regex.is_match(field_value) {
|
||||||
|
TransformResult::excluded()
|
||||||
|
} else {
|
||||||
|
TransformResult::present(field_value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
30
src/transformers/number.rs
Normal file
30
src/transformers/number.rs
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
lazy_static! {
    // Fully anchored pattern for an unsigned integer without leading zeros: either a lone "0"
    // or a digit 1-9 followed by any number of digits.
    //
    // The group is non-capturing (`(?:`). Spelling it `(:?` instead creates a capturing group
    // that begins with an optional literal ':', which accepts inputs such as ":0" and ":12".
    static ref INTEGER_REGEX: Regex = Regex::new(r"\A(?:0|[1-9]\d*)\z").unwrap();
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct NumberTransformer { }
|
||||||
|
|
||||||
|
impl NumberTransformer {
|
||||||
|
pub fn match_integer() -> NumberTransformer {
|
||||||
|
NumberTransformer { }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for NumberTransformer {
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
if INTEGER_REGEX.is_match(field_value) {
|
||||||
|
TransformResult::present(field_value)
|
||||||
|
} else {
|
||||||
|
TransformResult::error(field_value, field_name, record_n, "not a valid number")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
34
src/transformers/phone_number.rs
Normal file
34
src/transformers/phone_number.rs
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
lazy_static! {
    // Fully anchored NANP phone pattern: an optional "1" or "+1" prefix, then a three-digit
    // area code (optionally parenthesised), a three-digit exchange and a four-digit subscriber
    // number, with any run of non-digit characters allowed between the groups.
    static ref NANP_REGEX: Regex = {
        let pattern = r"\A(?:\+?1)?\D*\(?(?P<area>\d{3})\)?\D*(?P<exchange>\d{3})\D*(?P<subscriber>\d{4})\z";
        Regex::new(pattern).unwrap()
    };
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct PhoneNumberTransformer { }
|
||||||
|
|
||||||
|
impl PhoneNumberTransformer {
|
||||||
|
pub fn expect_nanp_format() -> PhoneNumberTransformer {
|
||||||
|
PhoneNumberTransformer { }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for PhoneNumberTransformer {
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
if let Some(captures) = NANP_REGEX.captures(field_value) {
|
||||||
|
let area_code = captures.name("area").unwrap().as_str();
|
||||||
|
let exchange_code = captures.name("exchange").unwrap().as_str();
|
||||||
|
let subscriber_number = captures.name("subscriber").unwrap().as_str();
|
||||||
|
let phone_number = format!("+1 {} {} {}", area_code, exchange_code, subscriber_number);
|
||||||
|
TransformResult::present(&phone_number)
|
||||||
|
} else {
|
||||||
|
TransformResult::error(field_value, field_name, record_n, "not a valid NANP format phone number")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
88
src/transformers/regex.rs
Normal file
88
src/transformers/regex.rs
Normal file
|
|
@ -0,0 +1,88 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
use newtypes::Regex;
|
||||||
|
|
||||||
|
use regex;
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct RegexTransformer
|
||||||
|
{
|
||||||
|
regex: Regex,
|
||||||
|
template: String
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RegexTransformer
|
||||||
|
{
|
||||||
|
pub fn new(regex: regex::Regex, template: &str) -> RegexTransformer {
|
||||||
|
RegexTransformer {
|
||||||
|
regex: Regex::from(regex),
|
||||||
|
template: template.to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for RegexTransformer
|
||||||
|
{
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
if let Some(captures) = self.regex.captures(field_value) {
|
||||||
|
let mut expansion = String::new();
|
||||||
|
captures.expand(&self.template, &mut expansion);
|
||||||
|
TransformResult::present(&expansion)
|
||||||
|
} else {
|
||||||
|
TransformResult::error(
|
||||||
|
field_value,
|
||||||
|
field_name,
|
||||||
|
record_n,
|
||||||
|
&format!("did not match pattern {}", self.regex)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct RegexMatchTransformer
|
||||||
|
{
|
||||||
|
regex: Regex,
|
||||||
|
negate: bool
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RegexMatchTransformer
|
||||||
|
{
|
||||||
|
pub fn matching(regex: regex::Regex) -> RegexMatchTransformer {
|
||||||
|
RegexMatchTransformer {
|
||||||
|
regex: Regex::from(regex),
|
||||||
|
negate: false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn not_matching(regex: regex::Regex) -> RegexMatchTransformer {
|
||||||
|
RegexMatchTransformer {
|
||||||
|
regex: Regex::from(regex),
|
||||||
|
negate: true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for RegexMatchTransformer
|
||||||
|
{
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
let mut is_match = self.regex.is_match(field_value);
|
||||||
|
if self.negate {
|
||||||
|
is_match = !is_match;
|
||||||
|
}
|
||||||
|
|
||||||
|
if is_match {
|
||||||
|
TransformResult::present(field_value)
|
||||||
|
} else {
|
||||||
|
let reason = if self.negate {
|
||||||
|
format!("matched exclusionary pattern {}", self.regex)
|
||||||
|
} else {
|
||||||
|
format!("did not match pattern {}", self.regex)
|
||||||
|
};
|
||||||
|
TransformResult::error(field_value, field_name, record_n, &reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
20
src/transformers/trim.rs
Normal file
20
src/transformers/trim.rs
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct TrimTransformer {}
|
||||||
|
|
||||||
|
impl TrimTransformer {
|
||||||
|
pub fn new() -> TrimTransformer {
|
||||||
|
TrimTransformer {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for TrimTransformer {
|
||||||
|
fn transform(&self, field_value: &str, _: &str, _: usize) -> TransformResult {
|
||||||
|
TransformResult::present(field_value.trim())
|
||||||
|
}
|
||||||
|
}
|
||||||
37
src/transformers/zipcode.rs
Normal file
37
src/transformers/zipcode.rs
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
use Transformer;
|
||||||
|
use transformer::{
|
||||||
|
TransformResultHelper,
|
||||||
|
TransformResult
|
||||||
|
};
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
lazy_static! {
    // Fully anchored zipcode pattern: five digits (group 1), any run of non-digit characters,
    // then an optional four-digit extension (group 2).
    static ref ZIP_REGEX: Regex = {
        let pattern = r"\A(\d{5})\D*(?:(\d{4}))?\z";
        Regex::new(pattern).unwrap()
    };
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Hash, Clone, Debug)]
|
||||||
|
pub struct ZipcodeTransformer { }
|
||||||
|
|
||||||
|
impl ZipcodeTransformer {
|
||||||
|
pub fn new() -> ZipcodeTransformer {
|
||||||
|
ZipcodeTransformer { }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Transformer for ZipcodeTransformer {
|
||||||
|
fn transform(&self, field_value: &str, field_name: &str, record_n: usize) -> TransformResult {
|
||||||
|
if let Some(captures) = ZIP_REGEX.captures(field_value) {
|
||||||
|
let base_code = captures.get(1).unwrap();
|
||||||
|
let plus_four_code = captures.get(2);
|
||||||
|
let zipcode = if let Some(pfc) = plus_four_code {
|
||||||
|
format!("{}-{}", base_code.as_str(), pfc.as_str())
|
||||||
|
} else {
|
||||||
|
base_code.as_str().to_string()
|
||||||
|
};
|
||||||
|
TransformResult::present(&zipcode)
|
||||||
|
} else {
|
||||||
|
TransformResult::error(field_value, field_name, record_n, "not a valid zipcode")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue