Changes from all commits (43 commits)
2e268fd
Create documentation folder. Add dictionary downloaded from FEC.
BobCochran Oct 6, 2019
f5062c1
Added header file to documentation folder.
BobCochran Oct 6, 2019
193d1ee
Expanded the .gitignore coverage
BobCochran Oct 6, 2019
507bbc4
Removed .vscode directory, not needed for Linux and probably needs to…
BobCochran Oct 6, 2019
98964ae
Start to reformat FEC individual donation input to json output.
BobCochran Oct 6, 2019
57c043d
Rename the node script, then put in the initial code.
BobCochran Oct 6, 2019
8409838
Split the header line.
BobCochran Oct 6, 2019
5544c01
Test whether Niedringhaus splitting code works.
BobCochran Oct 6, 2019
467861e
Split the full header line.
BobCochran Oct 6, 2019
17c968e
Split the transaction line.
BobCochran Oct 6, 2019
45137da
Start actual reformat to json.
BobCochran Oct 6, 2019
1657a24
Build the json output.
BobCochran Oct 6, 2019
4cb0105
Build the json output.
BobCochran Oct 6, 2019
95147bb
Build the json output.
BobCochran Oct 6, 2019
c7739df
Build the json output.
BobCochran Oct 6, 2019
53526a2
Build the json output.
BobCochran Oct 6, 2019
f14083a
Build the json output.
BobCochran Oct 6, 2019
eb9910c
Continue reformatting.
BobCochran Oct 9, 2019
d67b5f0
Correctly reformat transaction date to an ISO8601 date value that mon…
BobCochran Oct 9, 2019
a35efd8
Add ISODate value to json string.
BobCochran Oct 9, 2019
892ceb9
Reformatting of a single transaction is complete.
BobCochran Oct 10, 2019
71d5d1f
Correctly reformats one transaction to JSON.
BobCochran Oct 10, 2019
4520c92
Start processing transaction amount.
BobCochran Oct 10, 2019
7ea9898
Continue reformatting money amount
BobCochran Oct 10, 2019
7aab2aa
Reformat the contribution amount into decimal128.
BobCochran Oct 10, 2019
a0a296e
Try to process multiple lines of input.
BobCochran Oct 10, 2019
d72eb87
Write reformatted output to a file.
BobCochran Oct 10, 2019
67e3be7
Reformat a larger file containing donations from people employed by P…
BobCochran Oct 10, 2019
2b48ed2
Edit README file.
BobCochran Oct 11, 2019
d8a3b58
Fix README formatting.
BobCochran Oct 11, 2019
559dde7
Edit README.md document.
BobCochran Oct 11, 2019
3b7b2fa
Edit README.md file.
BobCochran Oct 11, 2019
fdd770f
Fix incorrect egrep example.
BobCochran Oct 11, 2019
8f40bf2
Continue updating the README.md
BobCochran Oct 11, 2019
8a20c59
Test transaction amount for a numeric value.
BobCochran Oct 11, 2019
cd4983b
Try to catch nonumeric transaction amounts.
BobCochran Oct 11, 2019
1d7d358
Try to catch nonumeric transaction amounts.
BobCochran Oct 11, 2019
20571a6
Correctly test for an empty string in the transaction amount field.
BobCochran Oct 26, 2019
8d8b1f5
Small program to split an inspect a line with no contribution amount.
BobCochran Oct 26, 2019
16b224e
Test of large input file; comment out console.log statements.
BobCochran Oct 26, 2019
7120a77
Edit to indicate number of records tested, and indicate Node.js and M…
BobCochran Oct 27, 2019
e4b8f85
Change writeStream to output to test1.json
BobCochran Oct 27, 2019
f8a2981
Change reference to BobCochran as contributor.
BobCochran Oct 27, 2019
103 changes: 101 additions & 2 deletions .gitignore
@@ -1,2 +1,101 @@
itcont.txt
node_modules/
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# next.js build output
.next

# nuxt.js build output
.nuxt

# gatsby files
.cache/
public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# Reformatted output data
database/mongodb_version4/reformatted/
14 changes: 0 additions & 14 deletions .vscode/launch.json

This file was deleted.

2 changes: 0 additions & 2 deletions .vscode/settings.json

This file was deleted.

62 changes: 55 additions & 7 deletions README.md
@@ -1,18 +1,37 @@
# Node.js Large File / Data Reading & Performance Testing
# Node.js Read Large Files Challenge

This is an example of 3 different ways to use Node.js to process big data files. One uses Node.js' `fs.readFile()`, another uses Node.js' `fs.createReadStream()`, and the final one uses the NPM module `EventStream`.
The challenge is to efficiently process a really large text file sourced from the Federal Election Commission. The input data consists of records of monetary contributions by individuals to political entities.

The code provided in this repository takes the form of Node.js scripts showcasing 3 different approaches to processing big data files. One script uses the Node.js `fs.readFile()` API, another uses `fs.createReadStream()`, and the third incorporates the external NPM module `EventStream`.
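For orientation, here is a minimal sketch of the `event-stream` line-reading pattern (the actual `readFileEventStream.js` script in this repository may differ in its details):

```javascript
const fs = require('fs');
const es = require('event-stream');

// Stream the file, split the stream into lines, and handle one line at a time.
fs.createReadStream('itcont.txt')
    .pipe(es.split())
    .pipe(es.mapSync((line) => {
        // process a single line here
    }))
    .on('end', () => console.log('done reading the file'));
```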

There is also use of `console.time` and `console.timeEnd` to determine the performance of the 3 different implementations, and which is most efficient at processing the files.
## Performance Testing of the Different Large File Reading Strategies

`console.time` and `console.timeEnd` are used to determine the performance of the 3 different implementations, and which one processes the input files most efficiently.

### To Download the Really Large FEC File

The text file to be processed consists of records of political campaign contributions by individuals during the 2018 election cycle.

### To Download the Really Large File
Download the large file zip here: https://www.fec.gov/files/bulk-downloads/2018/indiv18.zip

The main file in the zip, `itcont.txt`, can only be processed by the `readFileEventStream.js` file; the other two implementations can't handle the 2.55GB file size in memory (Node.js can only hold about 1.5GB in memory at one time).*
### To Download the Dictionary and Header Files

The indiv18.zip archive contains files which are essentially in comma-separated values style, with 21 fields per record. To make sense of them, you need two additional files from the FEC's data_dictionaries folder. A "Documentation" folder is provided in this repository which contains the two files listed below. However, those copies apply to the 2018 election data; if the file layouts have changed in subsequent election cycles, download the correct versions from the Federal Election Commission "bulk downloads" site. Check the data_dictionaries folder there for files named like the following and download them if needed:

bulk-downloads/data_dictionaries/indiv_dictionary.txt

bulk-downloads/data_dictionaries/indiv_header_file.csv

*Caveat: You can override the standard Node memory limit using the CLI argument `max-old-space-size=XYZ`. To run, pass in `node --max-old-space-size=8192 <FILE NAME>.js` (this will increase Node's memory limit to 8GB - just be careful not to make it so large that Node kills off other processes or crashes because it runs out of memory)
indiv_dictionary.txt explains the data provided in each field of a contribution record. indiv_header_file.csv is a header record in comma-separated values format, with one heading for each field of the contribution record.
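For reference, the 2018 header line should contain these 21 field names (verify against the downloaded indiv_header_file.csv before relying on the ordering):

`CMTE_ID,AMNDT_IND,RPT_TP,TRANSACTION_PGI,IMAGE_NUM,TRANSACTION_TP,ENTITY_TP,NAME,CITY,STATE,ZIP_CODE,EMPLOYER,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,OTHER_ID,TRAN_ID,FILE_NUM,MEMO_CD,MEMO_TEXT,SUB_ID`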

The indiv18.zip archive contains several files, some of which are quite large. The zip file alone can take 5+ minutes to download, depending on connection speed.

The main file in the zip archive, `itcont.txt`, is the largest at 2.55 GiB. It can only be processed by the `readFileEventStream.js` script; the other two scripts in this repository can't hold a file of that size in memory. Node.js can only hold about 1.5GB in memory at one time.*

*Caveat: You can override the standard Node memory limit using the CLI argument `max-old-space-size=XYZ`. To run, pass in `node --max-old-space-size=8192 <FILE NAME>.js`. This will increase Node's memory limit to 8 GiB - just be careful not to make the value so large that Node kills off other processes or crashes because it runs out of memory.

### To Run
Before the first run, run `npm install` from the command line to install the `event-stream` and `performance.now` packages from NPM.
Before the first run, run `npm install` from the command line to install the `event-stream` and `performance.now` packages from NPM. You may want to check the package.json file to adjust which versions of the external modules you are installing.

Add the file path for one of the files (could be the big one `itcont.txt` or any of its smaller siblings in the `indiv18` folder that were just downloaded), and type the command `node <FILE_NAME_TO_RUN>` on the command line.

@@ -21,5 +40,34 @@ Then you'll see the answers required from the file printed out to the terminal.
### To Check Performance Testing
Use one of the smaller files contained within the `indiv18` folder - they're all about 400MB and can be used with all 3 implementations. Run those along with the `console.time` and `performance.now()` references and you can see which solution is more performant and by how much.
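A minimal sketch of the timing pattern (the label is arbitrary and only needs to match between the two calls):

```javascript
console.time('readFileEventStream');
// ... read and process the file ...
console.timeEnd('readFileEventStream'); // prints the elapsed time, e.g. "readFileEventStream: 1234.567ms"
```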

### Option: Put FEC Contribution Records in a MongoDB v4.x Database Collection
It is possible to reformat the input records to a JavaScript Object Notation (JSON) format compatible with MongoDB database version 4.x. You must do some additional preparation work. The instructions here assume you are familiar with the Linux command line and Linux-based utilities such as sed and egrep.

Download and unzip the indiv18.zip file. Download the header file noted above. Make note of the path to which you unzipped the contribution files.
The header file is in comma-separated values format, using actual commas ',' as the separator. You must change the separator to a pipe symbol '|':

`sed 's/\,/\|/g' < indiv_header_file.csv > test1.csv`

You must then append individual contribution records to this test1.csv file. For testing purposes, use egrep to extract records of interest, such as contributions from people employed by particular companies:

`egrep 'PFIZER' itcont_2018_20181228_52010302.txt >> test1.csv`

Navigate to the database/mongodb_version4 folder.

Create a new folder named 'reformatted' in that folder.

On the command line, issue

`node reformat_fec_data_to_json.js path/to/your/test1.csv`

The input file test1.csv is reformatted to JSON and the output file is written to the reformatted/ folder that you created, with a .json extension. You can change the name of the output file by changing the writeStream arguments in the reformat_fec_data_to_json.js script, as shown below.
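The line to edit is the writeStream creation near the top of the script:

`const writeStream = fs.createWriteStream( "./reformatted/test1.json", { encoding: "utf8"} );`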

You can then import this reformatted data into a MongoDB version 4.x collection using the mongoimport utility, like so:

`mongoimport --db fecdata --collection t1 --file reformatted/test1.json`

The advantage of loading this data into a MongoDB collection is that you can then perform aggregation queries on the collection using MongoDB's db.collection.aggregate() method, as in the example below. You can also index the collection as you prefer.
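For example, here is a hypothetical aggregation, run in the mongo shell, that totals contributions by employer. It assumes the collection name t1 from the mongoimport example above and the FEC header field names EMPLOYER and TRANSACTION_AMT:

`db.t1.aggregate([ { $group: { _id: "$EMPLOYER", total: { $sum: "$TRANSACTION_AMT" }, count: { $sum: 1 } } }, { $sort: { total: -1 } }, { $limit: 10 } ])`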

Contributor BobCochran has only tested the script with 271,237 input records. To test the reformatting, Node.js versions 10.16.3 and 12.3.0 were used. The reformatted data was added to a standalone instance of MongoDB Enterprise server version 4.0.13, running on an Ubuntu 18.04.3 LTS server.


155 changes: 155 additions & 0 deletions database/mongodb_version4/reformat_fec_data_to_json.js
@@ -0,0 +1,155 @@
/* This code uses the Node.js readline API to read political campaign
* donation data obtained from the United States Federal Election
* Commission (the "FEC"). Each line of input from the *.txt file
* is reformatted into an output record that is in JSON format, and
* uses the specific data types documented by the MongoDB version 4.x
* database server.
*
* The specific input files being reformatted by this code are the
* FEC records of political donations by individuals, of USD $200.00
* or more. For example, the "indiv20.zip" file in the FEC bulk
* downloads area contains multiple *.txt files, each of which records
* a political donation of $200.00 or more by a named individual.
* The record layout is provided in the "documentation" folder appearing
* at the root folder of this repository.
*
* Usage: node reformat_fec_data_to_json.js path/to/input.csv
* The reformatted output is written to ./reformatted/test1.json
* (see the writeStream created below).
*/

const fs = require('fs');
const readline = require('readline');

// Count the number of input lines.

var lineCount = 0;

// An array that holds the fields of the header line of the csv file.
var myHdr = [];

const rl = readline.createInterface({

input: fs.createReadStream(process.argv[2]),

crlfDelay: Infinity

});

// Create a writeStream so we can write the reformatted output to a file

const writeStream = fs.createWriteStream( "./reformatted/test1.json", { encoding: "utf8"} );

// Split and save the first line -- treat that as the header line.

rl.on('line', (line) => {

lineCount++

if (lineCount === 1) {

/* Code by the original author splits a line using a
* technique like this:
*
* myHdr = line.split('|')[3]
*
* It has the effect of skipping the first 3 elements and
* capturing the fourth element -- and only the fourth.
* What I wish to do is different: split every field out,
* in order to reformat them into json-ified records.
*/

myHdr = line.split('|')

console.log('Elements from the header line are ' + myHdr)

}

if (lineCount > 1) {

var myTrans = line.split('|')

var jstring = "{ "

for (let i = 0; i < 21; i++) {

/* The 13th index value is the transaction date. This needs to be reformatted
* from a MMDDYYYY string to a YYYY-MM-DD string that can be converted to
* ISO8601 date format acceptable to the MongoDB 'mongoimport' utility.
*/

if (i === 13) {

var myDateStr = myTrans[i]

var theISODt = "ISODate(\"" + myDateStr[4] + myDateStr[5] + myDateStr[6] + myDateStr[7] + "-" + myDateStr[0] + myDateStr[1] + "-"

theISODt = theISODt + myDateStr[2] + myDateStr[3] + "T00:00:00Z\")"

jstring = jstring + "\"" + myHdr[i] + "\" : " + theISODt + ", "

}

/* The 14th index value is the transaction amount field. Reformat this into a
* $numberDecimal value (also known as Decimal128.) The value has to be formatted
* like so: "TRANSACTION_AMT" : {"$numberDecimal" : "120.00"}
*/

else if (i === 14) {

var myAmt = myTrans[i]

/* Is the amount field non-empty? If it is empty, substitute 0.00. */

if (myAmt !== "") {

var theContr = "{\"$numberDecimal\" : \"" + myAmt + ".00\"}"

jstring = jstring + "\"" + myHdr[i] + "\" : " + theContr + ", "

// console.log("The myTrans array " + myTrans)

// console.log("The myAmt value " + myAmt)

// console.log("The typeof for myAmt " + typeof myAmt)

} else {


var theContr = "{\"$numberDecimal\" : \"0.00\"}"

jstring = jstring + "\"" + myHdr[i] + "\" : " + theContr + ", "


}

}

/* The 20th index value is the final field to be reformatted. We want to close the
* string with a valid JSON closing brace.
*/

else if (i === 20) {

jstring = jstring + "\"" + myHdr[i] + "\" : " + "\"" + myTrans[i] + "\"" + " }"

} else {

jstring = jstring + "\"" + myHdr[i] + "\" : " + "\"" + myTrans[i] + "\", "

}

}

// console.log(jstring)

writeStream.write(jstring)

}
});

rl.on('close', () => {

console.log('Number of lines processed is ' + lineCount)

})

/* Helper to test whether a string is null, empty, or contains only
 * spaces. Currently unused by the line handler above. */
function isEmptyOrSpaces(str){
    return str === null || str.match(/^ *$/) !== null;
}
36 changes: 36 additions & 0 deletions database/mongodb_version4/test_array2.js
@@ -0,0 +1,36 @@
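/* Small test program: split and inspect a FEC contribution record line
 * that has no contribution amount (field index 14 is empty). */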
function splitString(stringToSplit, separator) {
const arrayOfStrings = stringToSplit.split(separator);

console.log('The original string is: "' + stringToSplit + '"');
console.log('The separator is: "' + separator + '"');

if (arrayOfStrings[14] === "") {
arrayOfStrings[14] = 0
console.log("The transaction amount has been replaced.")
}

if (arrayOfStrings.includes(undefined)) {

console.log("There are undefined or empty elements in the arrayOfStrings")
}
console.log("The Object.values " + Object.values(arrayOfStrings))
console.log(Object.values(arrayOfStrings).length)
console.log(arrayOfStrings.length)
console.log('The array has ' + arrayOfStrings.length + ' elements: ' + arrayOfStrings.join('/'));
}

const tempestString = 'Oh brave new world that has such people in it.';
const monthString = 'Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec';
const monthString2 = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep||Nov|Dec';
const fecString = 'C00339655|N|YE|P|201901179143856769|15|IND|COCHRAN, ERNEST W|PARIS|TX|754606333|TEXAS ONCOLOGY, P.A.|PHYSICIAN SHAREHOLDER MED ONC|12312018|||201901021615-165|1305336|||4021920191640570973'

const space = ' ';
const comma = ',';
const pipe = '|'

//splitString(tempestString, space);
//splitString(tempestString);
//splitString(monthString2, pipe);
splitString(fecString, pipe)

