Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,8 @@ lib/
.eslintcache

# Misc
.DS_Store
.DS_Store

# Local datasets
csvs
old_csvs
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ If you are on a GHEC instance (including EMU), please head to `https://github.co
DIRECTORY_OF_CSV_CONTENT=XXX
```

3. Run the script
3. Run the script to deduplicate by username

```bash
npm run start
Expand All @@ -75,3 +75,15 @@ The above script should output something like:
```
You have a total of XX unique developers across your GitHub instances.
```

If you want to deduplicate by email (to catch users with multiple usernames), run:

```bash
npm run email
```

The email deduplication script should output something like:

```
You have a total of XX unique developers (by email) across your GitHub instances.
```
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
},
"scripts": {
"start": "npm run build && node lib/src/main.js",
"email": "npm run build && node lib/src/main-email-dedup.js",
"clean": "rimraf coverage lib tmp",
"prebuild": "npm run lint",
"build": "tsc -p tsconfig.json",
Expand Down
62 changes: 62 additions & 0 deletions src/main-email-dedup.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { readdirSync, readFileSync } from 'fs';
import { extname, join } from 'path';
import { parse, ParseResult } from 'papaparse';
import * as dotenv from 'dotenv';
// Load variables from the local `.env` file into process.env before they are read below.
dotenv.config({ path: '.env' });

// Directory that is listed for CSV files and from which they are read.
// Falls back to an empty string when DIRECTORY_OF_CSV_CONTENT is unset or empty.
const GLOBAL_DIRECTORY = process.env.DIRECTORY_OF_CSV_CONTENT || '';

/**
 * Shape of one parsed CSV row, including the email column used for
 * email-based deduplication. Property names are the CSV header names,
 * because rows are parsed with `header: true`.
 */
type CSVDataWithEmail = {
'User login': string;
'Organization / repository': string;
'Last pushed date': string;
// Optional: a row may not carry an email value.
'Last pushed email'?: string;
};

/**
 * Lists the names of the entries found directly inside `folder`.
 *
 * @param folder - Path of the directory to list.
 * @returns The entry names (not full paths) as reported by `readdirSync`.
 * @throws Whatever `readdirSync` throws, e.g. when `folder` does not exist.
 */
export const getFilesInDirectory = async (folder: string): Promise<string[]> => {
  const entryNames: string[] = readdirSync(folder);
  return entryNames;
};

/**
 * Keeps only the file names whose extension equals `format`.
 *
 * The file's extension is lower-cased before the comparison but `format`
 * is used as given, so `format` must be lower-case and include the leading
 * dot (e.g. `'.csv'`).
 *
 * @param files - File names to inspect.
 * @param format - Extension to keep, including the dot.
 * @returns The matching names, in their original order.
 */
export const filerByFileExtention = async (
  files: string[],
  format: string,
): Promise<string[]> => {
  const matching: string[] = [];
  for (const name of files) {
    const extension = extname(name).toLowerCase();
    if (extension === format) {
      matching.push(name);
    }
  }
  return matching;
};

/**
 * Reads each named file from `GLOBAL_DIRECTORY` as UTF-8 text.
 *
 * @param files - File names relative to `GLOBAL_DIRECTORY`.
 * @returns The file contents, in the same order as `files`.
 * @throws Whatever `readFileSync` throws, e.g. when a file cannot be read.
 */
export const readMultipleFiles = async (files: string[]): Promise<string[]> =>
  files.map((file) =>
    // join() supplies the path separator, so the configured directory works
    // with or without a trailing slash (plain concatenation did not).
    readFileSync(join(GLOBAL_DIRECTORY, file), { encoding: 'utf8' }),
  );

/**
 * Parses every CSV text into a Papa Parse result, treating the first line
 * of each text as the header row.
 *
 * @param files - Raw CSV contents, one string per file.
 * @returns One parse result per input string, in the same order.
 */
export const convertContentToJSON = async (files: string[]) => {
  const parsedFiles: ParseResult<CSVDataWithEmail>[] = [];
  for (const csvText of files) {
    parsedFiles.push(parse<CSVDataWithEmail>(csvText, { header: true }));
  }
  return parsedFiles;
};

/**
 * Flattens the rows of several parse results into a single list.
 *
 * @param data - Parse results, one per CSV file.
 * @returns All rows of all files, in file order and then row order.
 */
export const mergeFileContent = async (
  data: ParseResult<CSVDataWithEmail>[],
): Promise<CSVDataWithEmail[]> => {
  const mergedData: CSVDataWithEmail[] = [];
  for (const file of data) {
    // Append row by row: `push(...file.data)` passes every row as a call
    // argument and can throw a RangeError on very large CSV files.
    for (const row of file.data) {
      mergedData.push(row);
    }
  }
  return mergedData;
};

/**
 * Collects the distinct "Last pushed email" values across all rows.
 *
 * Emails are normalised by trimming surrounding whitespace and lower-casing,
 * so `" Dev@Example.com "` and `"dev@example.com"` count as one developer.
 * Rows with a missing, non-string, or blank email are ignored.
 *
 * @param data - Merged CSV rows.
 * @returns The normalised unique emails, in order of first appearance.
 */
export const uniqueUsersByEmail = async (
  data: CSVDataWithEmail[],
): Promise<string[]> => {
  // A Set keeps insertion order and makes deduplication linear rather than
  // the quadratic indexOf scan.
  const seen = new Set<string>();
  for (const user of data) {
    const rawEmail = user['Last pushed email'];
    if (typeof rawEmail !== 'string') {
      continue;
    }
    const normalized = rawEmail.trim().toLowerCase();
    if (normalized !== '') {
      seen.add(normalized);
    }
  }
  return Array.from(seen);
};

/**
 * Entry point: lists the configured directory, reads and parses every
 * `.csv` file in it, merges the rows, and prints how many unique developer
 * emails were found.
 *
 * @throws Error when DIRECTORY_OF_CSV_CONTENT is not configured, plus any
 *   error raised while listing or reading the files.
 */
const run = async (): Promise<void> => {
  // Fail early with an actionable message rather than an opaque file-system
  // error from listing an empty path.
  if (GLOBAL_DIRECTORY === '') {
    throw new Error(
      'DIRECTORY_OF_CSV_CONTENT is not set. Define it in your .env file.',
    );
  }

  const files = await getFilesInDirectory(GLOBAL_DIRECTORY);
  const csvFilesFound = await filerByFileExtention(files, '.csv');
  const csvFiles = await readMultipleFiles(csvFilesFound);
  const jsonFiles = await convertContentToJSON(csvFiles);
  const content = await mergeFileContent(jsonFiles);
  const unique = await uniqueUsersByEmail(content);
  console.log(
    `You have a total of ${unique.length} unique developers (by email) across your GitHub instances.`,
  );
};

// Handle the rejection explicitly so the promise is not left floating, and
// signal failure to the calling shell via a non-zero exit code.
run().catch((error: unknown) => {
  const reason = error instanceof Error ? error.message : String(error);
  console.error(`Email deduplication failed: ${reason}`);
  process.exitCode = 1;
});