Changes from all commits (43 commits)
2e268fd
Create documentation folder. Add dictionary downloaded from FEC.
BobCochran Oct 6, 2019
f5062c1
Added header file to documentation folder.
BobCochran Oct 6, 2019
193d1ee
Expanded the .gitignore coverage
BobCochran Oct 6, 2019
507bbc4
Removed .vscode directory, not needed for Linux and probably needs to…
BobCochran Oct 6, 2019
98964ae
Start to reformat FEC individual donation input to json output.
BobCochran Oct 6, 2019
57c043d
Rename the node script, then put in the initial code.
BobCochran Oct 6, 2019
8409838
Split the header line.
BobCochran Oct 6, 2019
5544c01
Test whether Niedringhaus splitting code works.
BobCochran Oct 6, 2019
467861e
Split the full header line.
BobCochran Oct 6, 2019
17c968e
Split the transaction line.
BobCochran Oct 6, 2019
45137da
Start actual reformat to json.
BobCochran Oct 6, 2019
1657a24
Build the json output.
BobCochran Oct 6, 2019
4cb0105
Build the json output.
BobCochran Oct 6, 2019
95147bb
Build the json output.
BobCochran Oct 6, 2019
c7739df
Build the json output.
BobCochran Oct 6, 2019
53526a2
Build the json output.
BobCochran Oct 6, 2019
f14083a
Build the json output.
BobCochran Oct 6, 2019
eb9910c
Continue reformatting.
BobCochran Oct 9, 2019
d67b5f0
Correctly reformat transaction date to an ISO8601 date value that mon…
BobCochran Oct 9, 2019
a35efd8
Add ISODate value to json string.
BobCochran Oct 9, 2019
892ceb9
Reformatting of a single transaction is complete.
BobCochran Oct 10, 2019
71d5d1f
Correctly reformats one transaction to JSON.
BobCochran Oct 10, 2019
4520c92
Start processing transaction amount.
BobCochran Oct 10, 2019
7ea9898
Continue reformatting money amount
BobCochran Oct 10, 2019
7aab2aa
Reformat the contribution amount into decimal128.
BobCochran Oct 10, 2019
a0a296e
Try to process multiple lines of input.
BobCochran Oct 10, 2019
d72eb87
Write reformatted output to a file.
BobCochran Oct 10, 2019
67e3be7
Reformat a larger file containing donations from people employed by P…
BobCochran Oct 10, 2019
2b48ed2
Edit README file.
BobCochran Oct 11, 2019
d8a3b58
Fix README formatting.
BobCochran Oct 11, 2019
559dde7
Edit README.md document.
BobCochran Oct 11, 2019
3b7b2fa
Edit README.md file.
BobCochran Oct 11, 2019
fdd770f
Fix incorrect egrep example.
BobCochran Oct 11, 2019
8f40bf2
Continue updating the README.md
BobCochran Oct 11, 2019
8a20c59
Test transaction amount for a numeric value.
BobCochran Oct 11, 2019
cd4983b
Try to catch nonumeric transaction amounts.
BobCochran Oct 11, 2019
1d7d358
Try to catch nonumeric transaction amounts.
BobCochran Oct 11, 2019
20571a6
Correctly test for an empty string in the transaction amount field.
BobCochran Oct 26, 2019
8d8b1f5
Small program to split an inspect a line with no contribution amount.
BobCochran Oct 26, 2019
16b224e
Test of large input file; comment out console.log statements.
BobCochran Oct 26, 2019
7120a77
Edit to indicate number of records tested, and indicate Node.js and M…
BobCochran Oct 27, 2019
e4b8f85
Change writeStream to output to test1.json
BobCochran Oct 27, 2019
f8a2981
Change reference to BobCochran as contributor.
BobCochran Oct 27, 2019
103 changes: 101 additions & 2 deletions .gitignore
@@ -1,2 +1,101 @@
itcont.txt
node_modules/
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# next.js build output
.next

# nuxt.js build output
.nuxt

# gatsby files
.cache/
public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# Reformatted output data
database/mongodb_version4/reformatted/
14 changes: 0 additions & 14 deletions .vscode/launch.json

This file was deleted.

2 changes: 0 additions & 2 deletions .vscode/settings.json

This file was deleted.

62 changes: 55 additions & 7 deletions README.md
@@ -1,18 +1,37 @@
# Node.js Large File / Data Reading & Performance Testing
# Node.js Read Large Files Challenge

This is an example of 3 different ways to use Node.js to process big data files. One uses Node.js' `fs.readFile()`, another uses Node.js' `fs.createReadStream()`, and the final one uses the NPM module `EventStream`.
The challenge is to efficiently process a really large text file sourced from the Federal Election Commission. The input data consists of records of monetary contributions by individuals to political entities.

The code provided in this repository takes the form of Node.js scripts showcasing 3 different approaches to processing big data files. One script uses the Node.js `fs.readFile()` API, another uses `fs.createReadStream()`, and the third incorporates the external NPM module `EventStream`.
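For orientation, here is a minimal sketch of the `event-stream` line-reading pattern (the actual `readFileEventStream.js` script in this repository may differ in its details):

```javascript
const fs = require('fs');
const es = require('event-stream');

// Stream the file, split the stream into lines, and handle one line at a time.
fs.createReadStream('itcont.txt')
    .pipe(es.split())
    .pipe(es.mapSync((line) => {
        // process a single line here
    }))
    .on('end', () => console.log('done reading the file'));
```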

There is also use of `console.time` and `console.timeEnd` to determine the performance of the 3 different implementations, and which is most efficient at processing the files.
## Performance Testing of the Different Large File Reading Strategies

`console.time` and `console.timeEnd` are used to determine the performance of the 3 different implementations, and which one processes the input files most efficiently.

### To Download the Really Large FEC File

The text file to be processed consists of records of political campaign contributions by individuals during the 2018 election cycle.

### To Download the Really Large File
Download the large file zip here: https://www.fec.gov/files/bulk-downloads/2018/indiv18.zip

The main file in the zip, `itcont.txt`, can only be processed by the `readFileEventStream.js` file; the other two implementations can't handle the 2.55GB file size in memory (Node.js can only hold about 1.5GB in memory at one time).*
### To Download the Dictionary and Header Files

The indiv18.zip archive contains files which are essentially in comma-separated values style, with 21 fields per record. To make sense of them, you need two additional files from the FEC's data_dictionaries folder. A "Documentation" folder is provided in this repository which contains the two files listed below. However, those copies apply to the 2018 election data; if the file layouts have changed in subsequent election cycles, download the correct versions from the Federal Election Commission "bulk downloads" site. Check the data_dictionaries folder there for files named like the following and download them if needed:

bulk-downloads/data_dictionaries/indiv_dictionary.txt

bulk-downloads/data_dictionaries/indiv_header_file.csv

*Caveat: You can override the standard Node memory limit using the CLI argument `max-old-space-size=XYZ`. To run, pass in `node --max-old-space-size=8192 <FILE NAME>.js` (this will increase Node's memory limit to 8GB - just be careful not to make it so large that Node kills off other processes or crashes because it runs out of memory)
indiv_dictionary.txt explains the data provided in each field of a contribution record. indiv_header_file.csv is a header record in comma-separated values format, with one heading for each field of the contribution record.
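For reference, the 2018 header line should contain these 21 field names (verify against the downloaded indiv_header_file.csv before relying on the ordering):

`CMTE_ID,AMNDT_IND,RPT_TP,TRANSACTION_PGI,IMAGE_NUM,TRANSACTION_TP,ENTITY_TP,NAME,CITY,STATE,ZIP_CODE,EMPLOYER,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,OTHER_ID,TRAN_ID,FILE_NUM,MEMO_CD,MEMO_TEXT,SUB_ID`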

The indiv18.zip archive contains several files, some of which are quite large. The zip file alone can take 5+ minutes to download, depending on connection speed.

The main file in the zip archive, `itcont.txt`, is the largest at 2.55 GiB. It can only be processed by the `readFileEventStream.js` script; the other two scripts in this repository can't hold a file of that size in memory. Node.js can only hold about 1.5GB in memory at one time.*

*Caveat: You can override the standard Node memory limit using the CLI argument `max-old-space-size=XYZ`. To run, pass in `node --max-old-space-size=8192 <FILE NAME>.js`. This will increase Node's memory limit to 8 GiB - just be careful not to make the value so large that Node kills off other processes or crashes because it runs out of memory.

### To Run
Before the first run, run `npm install` from the command line to install the `event-stream` and `performance.now` packages from NPM.
Before the first run, run `npm install` from the command line to install the `event-stream` and `performance.now` packages from NPM. You may want to check the package.json file to adjust which versions of the external modules you are installing.

Add the file path for one of the files (could be the big one `itcont.txt` or any of its smaller siblings in the `indiv18` folder that were just downloaded), and type the command `node <FILE_NAME_TO_RUN>` on the command line.

@@ -21,5 +40,34 @@ Then you'll see the answers required from the file printed out to the terminal.
### To Check Performance Testing
Use one of the smaller files contained within the `indiv18` folder - they're all about 400MB and can be used with all 3 implementations. Run those along with the `console.time` and `performance.now()` references and you can see which solution is more performant and by how much.
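A minimal sketch of the timing pattern (the label is arbitrary and only needs to match between the two calls):

```javascript
console.time('readFileEventStream');
// ... read and process the file ...
console.timeEnd('readFileEventStream'); // prints the elapsed time, e.g. "readFileEventStream: 1234.567ms"
```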

### Option: Put FEC Contribution Records in a MongoDB v4.x Database Collection
It is possible to reformat the input records to a JavaScript Object Notation (JSON) format compatible with MongoDB database version 4.x. You must do some additional preparation work. The instructions here assume you are familiar with the Linux command line and Linux-based utilities such as sed and egrep.

Download and unzip the indiv18.zip file. Download the header file noted above. Make note of the path to which you unzipped the contribution files.
The header file is in comma-separated values format, using actual commas ',' as the separator. You must change the separator to a pipe symbol '|':

`sed 's/\,/\|/g' < indiv_header_file.csv > test1.csv`

You must then append individual contribution records to this test1.csv file. For testing purposes, use egrep to extract records of interest, such as contributions from people employed by particular companies:

`egrep 'PFIZER' itcont_2018_20181228_52010302.txt >> test1.csv`

Navigate to the database/mongodb_version4 folder.

Create a new folder named 'reformatted' in that folder.

On the command line, issue

`node reformat_fec_data_to_json.js path/to/your/test1.csv`

The input file test1.csv is reformatted to JSON and the output file is written to the reformatted/ folder that you created, with a .json extension. You can change the name of the output file by changing the writeStream arguments in the reformat_fec_data_to_json.js script, as shown below.
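The line to edit is the writeStream creation near the top of the script:

`const writeStream = fs.createWriteStream( "./reformatted/test1.json", { encoding: "utf8"} );`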

You can then import this reformatted data into a MongoDB version 4.x collection using the mongoimport utility, like so:

`mongoimport --db fecdata --collection t1 --file reformatted/test1.json`

The advantage of loading this data into a MongoDB collection is that you can then perform aggregation queries on the collection using MongoDB's db.collection.aggregate() method, as in the example below. You can also index the collection as you prefer.
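For example, here is a hypothetical aggregation, run in the mongo shell, that totals contributions by employer. It assumes the collection name t1 from the mongoimport example above and the FEC header field names EMPLOYER and TRANSACTION_AMT:

`db.t1.aggregate([ { $group: { _id: "$EMPLOYER", total: { $sum: "$TRANSACTION_AMT" }, count: { $sum: 1 } } }, { $sort: { total: -1 } }, { $limit: 10 } ])`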

Contributor BobCochran has only tested the script with 271,237 input records. To test the reformatting, Node.js versions 10.16.3 and 12.3.0 were used. The reformatted data was added to a standalone instance of MongoDB Enterprise server version 4.0.13, running on an Ubuntu 18.04.3 LTS server.


155 changes: 155 additions & 0 deletions database/mongodb_version4/reformat_fec_data_to_json.js
@@ -0,0 +1,155 @@
/* This code uses the Node.js readline API to read political campaign
* donation data obtained from the United States Federal Election
* Commission (the "FEC"). Each line of input from the *.txt file
* is reformatted into an output record that is in JSON format, and
* uses the specific data types documented by the MongoDB version 4.x
* database server.
*
* The specific input files being reformatted by this code are the
* FEC records of political donations by individuals, of USD $200.00
* or more. For example, the "indiv20.zip" file in the FEC bulk
* downloads area contains multiple *.txt files, each of which records
* a political donation of $200.00 or more by a named individual.
* The record layout is provided in the "documentation" folder appearing
* at the root folder of this repository.
*
* Usage: node reformat_fec_data_to_json.js path/to/input.csv
* The reformatted output is written to ./reformatted/test1.json
* (see the writeStream created below).
*/

const fs = require('fs');
const readline = require('readline');

// Count the number of input lines.

var lineCount = 0;

// An array that holds the fields of the header line of the csv file.
var myHdr = [];

const rl = readline.createInterface({

input: fs.createReadStream(process.argv[2]),

crlfDelay: Infinity

});

// Create a writeStream so we can write the reformatted output to a file

const writeStream = fs.createWriteStream( "./reformatted/test1.json", { encoding: "utf8"} );

// Split and save the first line -- treat that as the header line.

rl.on('line', (line) => {

lineCount++

if (lineCount === 1) {

/* Code by the original author splits a line using a
* technique like this:
*
* myHdr = line.split('|')[3]
*
* It has the effect of skipping the first 3 elements and
* capturing the fourth element -- and only the fourth.
* What I wish to do is different: split every field out,
* in order to reformat them into json-ified records.
*/

myHdr = line.split('|')

console.log('Elements from the header line are ' + myHdr)

}

if (lineCount > 1) {

var myTrans = line.split('|')

var jstring = "{ "

for (let i = 0; i < 21; i++) {

/* The 13th index value is the transaction date. This needs to be reformatted
* from a MMDDYYYY string to a YYYY-MM-DD string that can be converted to
* ISO8601 date format acceptable to the MongoDB 'mongoimport' utility.
*/

if (i === 13) {

var myDateStr = myTrans[i]

var theISODt = "ISODate(\"" + myDateStr[4] + myDateStr[5] + myDateStr[6] + myDateStr[7] + "-" + myDateStr[0] + myDateStr[1] + "-"

theISODt = theISODt + myDateStr[2] + myDateStr[3] + "T00:00:00Z\")"

jstring = jstring + "\"" + myHdr[i] + "\" : " + theISODt + ", "

}

/* The 14th index value is the transaction amount field. Reformat this into a
* $numberDecimal value (also known as Decimal128.) The value has to be formatted
* like so: "TRANSACTION_AMT" : {"$numberDecimal" : "120.00"}
*/

else if (i === 14) {

var myAmt = myTrans[i]

/* Is the amount field non-empty? If it is empty, substitute 0.00. */

if (myAmt !== "") {

var theContr = "{\"$numberDecimal\" : \"" + myAmt + ".00\"}"

jstring = jstring + "\"" + myHdr[i] + "\" : " + theContr + ", "

// console.log("The myTrans array " + myTrans)

// console.log("The myAmt value " + myAmt)

// console.log("The typeof for myAmt " + typeof myAmt)

} else {


var theContr = "{\"$numberDecimal\" : \"0.00\"}"

jstring = jstring + "\"" + myHdr[i] + "\" : " + theContr + ", "


}

}

/* The 20th index value is the final field to be reformatted. We want to close the
* string with a valid JSON closing brace.
*/

else if (i === 20) {

jstring = jstring + "\"" + myHdr[i] + "\" : " + "\"" + myTrans[i] + "\"" + " }"

} else {

jstring = jstring + "\"" + myHdr[i] + "\" : " + "\"" + myTrans[i] + "\", "

}

}

// console.log(jstring)

writeStream.write(jstring)

}
});

rl.on('close', () => {

console.log('Number of lines processed is ' + lineCount)

})

/* Helper to test whether a string is null, empty, or contains only
 * spaces. Currently unused by the line handler above. */
function isEmptyOrSpaces(str){
    return str === null || str.match(/^ *$/) !== null;
}
36 changes: 36 additions & 0 deletions database/mongodb_version4/test_array2.js
@@ -0,0 +1,36 @@
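/* Small test program: split and inspect a FEC contribution record line
 * that has no contribution amount (field index 14 is empty). */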
function splitString(stringToSplit, separator) {
const arrayOfStrings = stringToSplit.split(separator);

console.log('The original string is: "' + stringToSplit + '"');
console.log('The separator is: "' + separator + '"');

if (arrayOfStrings[14] === "") {
arrayOfStrings[14] = 0
console.log("The transaction amount has been replaced.")
}

if (arrayOfStrings.includes(undefined)) {

console.log("There are undefined or empty elements in the arrayOfStrings")
}
console.log("The Object.values " + Object.values(arrayOfStrings))
console.log(Object.values(arrayOfStrings).length)
console.log(arrayOfStrings.length)
console.log('The array has ' + arrayOfStrings.length + ' elements: ' + arrayOfStrings.join('/'));
}

const tempestString = 'Oh brave new world that has such people in it.';
const monthString = 'Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec';
const monthString2 = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep||Nov|Dec';
const fecString = 'C00339655|N|YE|P|201901179143856769|15|IND|COCHRAN, ERNEST W|PARIS|TX|754606333|TEXAS ONCOLOGY, P.A.|PHYSICIAN SHAREHOLDER MED ONC|12312018|||201901021615-165|1305336|||4021920191640570973'

const space = ' ';
const comma = ',';
const pipe = '|'

//splitString(tempestString, space);
//splitString(tempestString);
//splitString(monthString2, pipe);
splitString(fecString, pipe)

