Skip to content

Commit 830021b

Browse files
author
Miltos Allamanis
committed
Add tokenizers to project.
1 parent 6b8f409 commit 830021b

File tree

11 files changed

+570
-0
lines changed

11 files changed

+570
-0
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio 15
4+
VisualStudioVersion = 15.0.28010.2026
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CsharpTokenizer", "CsharpTokenizer\CsharpTokenizer.csproj", "{D2EB7C17-FD56-46D2-B700-DF485B36A9CC}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{D2EB7C17-FD56-46D2-B700-DF485B36A9CC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{D2EB7C17-FD56-46D2-B700-DF485B36A9CC}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{D2EB7C17-FD56-46D2-B700-DF485B36A9CC}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{D2EB7C17-FD56-46D2-B700-DF485B36A9CC}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {4D42F867-AA8D-43CB-8473-32C34A7C79E2}
24+
EndGlobalSection
25+
EndGlobal
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp2.1</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<PackageReference Include="Microsoft.CodeAnalysis.CSharp" Version="2.9.0" />
10+
<PackageReference Include="Newtonsoft.Json" Version="11.0.2" />
11+
</ItemGroup>
12+
13+
</Project>
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
using Microsoft.CodeAnalysis;
2+
using Microsoft.CodeAnalysis.CSharp;
3+
using Newtonsoft.Json;
4+
using System;
5+
using System.Collections.Generic;
6+
using System.Diagnostics;
7+
using System.IO;
8+
using System.IO.Compression;
9+
using System.Linq;
10+
using System.Threading.Tasks;
11+
12+
namespace CsharpTokenizer
13+
{
14+
/// <summary>
/// Command-line entry point of the C# tokenizer. For every project folder found
/// directly under the projects folder it writes one gzip-compressed JSON-lines
/// file (<c>&lt;project&gt;.jsonl.gz</c>) holding the tokens of each <c>*.cs</c> file.
/// </summary>
class Program
{
    /// <summary>
    /// Validates the command line and tokenizes all project folders in parallel.
    /// </summary>
    /// <param name="args">
    /// <c>projectsFolder outputFolder true|false</c>; the last flag selects
    /// whether only identifier tokens are kept.
    /// </param>
    static void Main(string[] args)
    {
        // Parse the flag once, up front: a malformed value now yields the usage
        // message instead of an AggregateException thrown from the parallel loop.
        if (args.Length != 3 || !bool.TryParse(args[2], out var onlyIdentifiers))
        {
            Console.WriteLine("Usage <projectsFolder> <outputFolder> true|false");
            return;
        }

        var projectsFolder = args[0];
        var outputFolder = args[1];

        Parallel.ForEach(
            Directory.EnumerateDirectories(projectsFolder),
            d => ExtractForProjectFolder(d, outputFolder, onlyIdentifiers, projectsFolder)
        );
    }

    /// <summary>
    /// Tokenizes every <c>*.cs</c> file below <paramref name="projectDir"/> and writes
    /// one JSON object per file to <c>&lt;outputDir&gt;/&lt;projectDirName&gt;.jsonl.gz</c>.
    /// </summary>
    /// <param name="projectDir">Root folder of the project to tokenize.</param>
    /// <param name="outputDir">Folder receiving the output file; created when missing.</param>
    /// <param name="onlyIdentifiers">When true, only identifier tokens are emitted.</param>
    /// <param name="baseDir">Folder against which the stored file names are made relative.</param>
    public static void ExtractForProjectFolder(string projectDir, string outputDir, bool onlyIdentifiers, string baseDir)
    {
        var allFiles = Directory.EnumerateFiles(projectDir, "*.cs", SearchOption.AllDirectories);

        // Path.GetFileName returns "" for a path ending in a separator, which would
        // name the output ".jsonl.gz"; strip trailing separators first.
        var projectDirName = Path.GetFileName(
            projectDir.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar));

        // No-op when the folder already exists.
        Directory.CreateDirectory(outputDir);

        using (var fileStream = File.Create(Path.Combine(outputDir, projectDirName + ".jsonl.gz")))
        using (var gzipStream = new GZipStream(fileStream, CompressionMode.Compress, false))
        using (var textStream = new StreamWriter(gzipStream))
        {
            // Files are tokenized in parallel; the lines are written by this thread only.
            foreach (var fileJson in allFiles.AsParallel().Select(f => GetJsonForFile(f, onlyIdentifiers, baseDir)))
            {
                textStream.WriteLine(fileJson);
            }
        }
    }

    /// <summary>
    /// Builds the JSON line for one source file: its path relative to
    /// <paramref name="baseDir"/> plus its tokens.
    /// </summary>
    private static string GetJsonForFile(string filepath, bool onlyIdentifiers, string baseDir)
    {
        var tokens = GetFileIdentifierTokens(filepath, onlyIdentifiers);

        // Paths are not linguistic text, so compare ordinally rather than with
        // the current culture.
        Debug.Assert(filepath.StartsWith(baseDir, StringComparison.Ordinal));

        var tokenData = new TokenData()
        {
            tokens = tokens.ToArray(),
            filename = Path.GetRelativePath(baseDir, filepath)
        };

        return JsonConvert.SerializeObject(tokenData);
    }

    /// <summary>
    /// Returns the text of the tokens in the file, optionally restricted to
    /// identifier tokens. Tokens with empty text are dropped.
    /// </summary>
    private static IEnumerable<string> GetFileIdentifierTokens(string filepath, bool onlyIdentifiers)
    {
        var tokens = GetFileTokens(filepath);
        if (onlyIdentifiers)
        {
            tokens = tokens.Where(t => t.IsKind(SyntaxKind.IdentifierToken));
        }

        // Zero-width tokens (the end-of-file token, tokens synthesized by the
        // parser's error recovery) carry no source text; emitting them would add
        // "" entries to the token list.
        return tokens.Select(t => t.Text).Where(text => text.Length > 0);
    }

    /// <summary>Returns every token of the file's syntax tree, in the order Roslyn enumerates them.</summary>
    private static IEnumerable<SyntaxToken> GetFileTokens(string filepath) =>
        GetASTFromFile(filepath).GetRoot().DescendantTokens();

    /// <summary>Reads the file and parses it into a C# syntax tree.</summary>
    private static SyntaxTree GetASTFromFile(string filePath)
    {
        return CSharpSyntaxTree.ParseText(File.ReadAllText(filePath));
    }
}
80+
81+
/// <summary>
/// One output record: a source file and the tokens extracted from it.
/// The field names and their declaration order are kept as-is because the
/// struct is passed straight to the JSON serializer.
/// </summary>
public struct TokenData
{
    /// <summary>Path of the tokenized source file.</summary>
    public string filename;

    /// <summary>Text of each extracted token.</summary>
    public string[] tokens;
}
86+
}

tokenizers/java/pom.xml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0"
3+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5+
<modelVersion>4.0.0</modelVersion>
6+
7+
<groupId>DPU</groupId>
8+
<!-- The name of this project (actually, the name of the artifact, which is the thing that this project produces. A jar in this case.) -->
9+
<artifactId>javatokenizer</artifactId>
10+
<!-- The version of this project. SNAPSHOT means "we're still working on it" -->
11+
<version>1.0-SNAPSHOT</version>
12+
13+
<properties>
14+
<!-- Tell Maven we want to use Java 8 -->
15+
<maven.compiler.source>1.8</maven.compiler.source>
16+
<maven.compiler.target>1.8</maven.compiler.target>
17+
<!-- Tell Maven to treat all source files as UTF-8 -->
18+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
19+
</properties>
20+
21+
<dependencies>
22+
<!-- Here are all your dependencies (currently three). Maven downloads them automatically; you can browse them at https://mvnrepository.com/ -->
23+
<dependency>
24+
<groupId>com.github.javaparser</groupId>
25+
<artifactId>javaparser-symbol-solver-core</artifactId>
26+
<version>3.6.19</version>
27+
</dependency>
28+
<!-- JavaParser itself is not declared as a dependency here. It gets included indirectly through javaparser-symbol-solver-core -->
29+
<dependency>
30+
<groupId>commons-io</groupId>
31+
<artifactId>commons-io</artifactId>
32+
<version>2.6</version>
33+
</dependency>
34+
<dependency>
35+
<groupId>com.google.code.gson</groupId>
36+
<artifactId>gson</artifactId>
37+
<version>2.8.5</version>
38+
</dependency>
39+
</dependencies>
40+
</project>
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
package javatokenizer;
2+
3+
import org.apache.commons.io.FileUtils;
4+
import org.apache.commons.io.filefilter.DirectoryFileFilter;
5+
import org.apache.commons.io.filefilter.TrueFileFilter;
6+
7+
import java.io.FileOutputStream;
8+
import java.io.FilenameFilter;
9+
import java.io.File;
10+
import java.io.OutputStreamWriter;
11+
import java.io.IOException;
12+
import java.io.Writer;
13+
import java.nio.charset.Charset;
14+
import java.nio.file.Paths;
15+
import java.util.ArrayList;
16+
import java.util.Arrays;
17+
import java.util.Iterator;
18+
import java.util.List;
19+
import java.util.Spliterators;
20+
import java.util.stream.StreamSupport;
21+
import java.util.zip.GZIPOutputStream;
22+
23+
import com.github.javaparser.JavaParser;
24+
import com.github.javaparser.JavaToken;
25+
import com.github.javaparser.ast.CompilationUnit;
26+
import com.google.gson.Gson;
27+
import com.google.gson.GsonBuilder;
28+
29+
30+
public class Extractor {
31+
32+
public static void main(String[] args) throws IOException {
33+
if (args.length != 3) {
34+
System.err.println("Usage <projectsFolder> <outputFolder> true|false");
35+
System.exit(-1);
36+
}
37+
38+
File projectFolder = new File(args[0]);
39+
File outputFolder = new File(args[1]);
40+
41+
String[] projectFolders = projectFolder.list(new FilenameFilter() {
42+
@Override
43+
public boolean accept(File current, String name) {
44+
return new File(current, name).isDirectory();
45+
}
46+
});
47+
Arrays.stream(projectFolders).forEach(f->ExtractForFolder(new File(projectFolder, f),
48+
outputFolder, Boolean.parseBoolean(args[2]), projectFolder));
49+
}
50+
51+
public static void ExtractForFolder(File projectFolder, File outputFolder, boolean onlyIdentifiers, File baseFolder) {
52+
Iterator<File> allFiles = FileUtils.iterateFiles(projectFolder, new String[] {"java"}, true);
53+
try {
54+
FileOutputStream output = new FileOutputStream(Paths.get(outputFolder.toPath().toString(), projectFolder.getName() + ".jsonl.gz").toFile());
55+
Gson gson = new GsonBuilder().create();
56+
57+
try {
58+
Writer writer = new OutputStreamWriter(new GZIPOutputStream(output), "UTF-8");
59+
Iterable<File> fileIter = ()-> allFiles;
60+
StreamSupport.stream(
61+
fileIter.spliterator(), true).map(f-> TokenizeFile(f, onlyIdentifiers, baseFolder))
62+
.map(t->gson.toJson(t)).filter(g->g!=null).sequential().forEach(g->{
63+
try{
64+
writer.write(g);
65+
writer.write('\n');
66+
} catch (IOException ioe) {
67+
// really?
68+
ioe.printStackTrace();
69+
}
70+
});
71+
writer.close();
72+
} catch(Exception e) {
73+
System.out.println("Error for project " + projectFolder + ": " + e);
74+
e.printStackTrace();
75+
} finally {
76+
output.close();
77+
}
78+
} catch (IOException e) {
79+
System.out.println("Error for project " + projectFolder + ": " + e);
80+
e.printStackTrace();
81+
}
82+
}
83+
84+
public static class SerializableTokens {
85+
String filename;
86+
List<String> tokens;
87+
}
88+
89+
public static SerializableTokens TokenizeFile(File sourceFile, boolean onlyIdentifiers, File baseDirectory) {
90+
CompilationUnit cu;
91+
System.out.println("Tokenizing " + sourceFile + "...");
92+
try {
93+
cu = JavaParser.parse(
94+
FileUtils.readFileToString(
95+
sourceFile,
96+
Charset.defaultCharset()));
97+
List<String> allTokens = new ArrayList<>();
98+
for(JavaToken token : cu.getTokenRange().get()) {
99+
JavaToken.Kind tokenKind = JavaToken.Kind.valueOf(token.getKind());
100+
if (tokenKind == JavaToken.Kind.SPACE ||
101+
tokenKind == JavaToken.Kind.EOF ||
102+
tokenKind == JavaToken.Kind.WINDOWS_EOL ||
103+
tokenKind == JavaToken.Kind.UNIX_EOL ||
104+
tokenKind == JavaToken.Kind.OLD_MAC_EOL ||
105+
tokenKind == JavaToken.Kind.SINGLE_LINE_COMMENT ||
106+
tokenKind == JavaToken.Kind.ENTER_JAVADOC_COMMENT ||
107+
tokenKind == JavaToken.Kind.JAVADOC_COMMENT ||
108+
tokenKind == JavaToken.Kind.ENTER_MULTILINE_COMMENT ||
109+
tokenKind == JavaToken.Kind.MULTI_LINE_COMMENT ||
110+
tokenKind == JavaToken.Kind.COMMENT_CONTENT) {
111+
continue;
112+
}
113+
if (!onlyIdentifiers || tokenKind == JavaToken.Kind.IDENTIFIER) {
114+
allTokens.add(token.getText());
115+
}
116+
}
117+
118+
SerializableTokens serializableObject = new SerializableTokens();
119+
serializableObject.filename = baseDirectory.toURI().relativize(sourceFile.toURI()).toString();
120+
serializableObject.tokens = allTokens;
121+
return serializableObject;
122+
} catch (Exception e) {
123+
System.err.println("Failed to parse " + sourceFile);
124+
//e.printStackTrace();
125+
return null;
126+
} catch (StackOverflowError e) {
127+
return null;
128+
}
129+
}
130+
}

tokenizers/javascript/.gitignore

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Logs
2+
logs
3+
*.log
4+
npm-debug.log*
5+
yarn-debug.log*
6+
yarn-error.log*
7+
8+
# Runtime data
9+
pids
10+
*.pid
11+
*.seed
12+
*.pid.lock
13+
14+
# Directory for instrumented libs generated by jscoverage/JSCover
15+
lib-cov
16+
17+
# Coverage directory used by tools like istanbul
18+
coverage
19+
20+
# nyc test coverage
21+
.nyc_output
22+
23+
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
24+
.grunt
25+
26+
# Bower dependency directory (https://bower.io/)
27+
bower_components
28+
29+
# node-waf configuration
30+
.lock-wscript
31+
32+
# Compiled binary addons (https://nodejs.org/api/addons.html)
33+
build/Release
34+
35+
# Dependency directories
36+
node_modules/
37+
jspm_packages/
38+
39+
# TypeScript v1 declaration files
40+
typings/
41+
42+
# Optional npm cache directory
43+
.npm
44+
45+
# Optional eslint cache
46+
.eslintcache
47+
48+
# Optional REPL history
49+
.node_repl_history
50+
51+
# Output of 'npm pack'
52+
*.tgz
53+
54+
# Yarn Integrity file
55+
.yarn-integrity
56+
57+
# dotenv environment variables file
58+
.env
59+
60+
# parcel-bundler cache (https://parceljs.org/)
61+
.cache
62+
63+
# next.js build output
64+
.next
65+
66+
# nuxt.js build output
67+
.nuxt
68+
69+
# vuepress build output
70+
.vuepress/dist
71+
72+
# Serverless directories
73+
.serverless

tokenizers/javascript/package-lock.json

Lines changed: 13 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)