Skip to content

Commit 66c13ab

Browse files
author
Miltos
authored
Merge pull request #4 from BillHally/master
Add an F# tokenizer
2 parents 176eabe + ea4042a commit 66c13ab

File tree

3 files changed

+130
-0
lines changed

3 files changed

+130
-0
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio 15
4+
VisualStudioVersion = 15.0.26124.0
5+
MinimumVisualStudioVersion = 15.0.26124.0
6+
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "FSharpTokenizer", "FSharpTokenizer\FSharpTokenizer.fsproj", "{F658D8FD-352D-4112-A3E8-A1EE26DC7540}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Debug|x64 = Debug|x64
12+
Debug|x86 = Debug|x86
13+
Release|Any CPU = Release|Any CPU
14+
Release|x64 = Release|x64
15+
Release|x86 = Release|x86
16+
EndGlobalSection
17+
GlobalSection(SolutionProperties) = preSolution
18+
HideSolutionNode = FALSE
19+
EndGlobalSection
20+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
21+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
22+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Debug|Any CPU.Build.0 = Debug|Any CPU
23+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Debug|x64.ActiveCfg = Debug|Any CPU
24+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Debug|x64.Build.0 = Debug|Any CPU
25+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Debug|x86.ActiveCfg = Debug|Any CPU
26+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Debug|x86.Build.0 = Debug|Any CPU
27+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Release|Any CPU.ActiveCfg = Release|Any CPU
28+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Release|Any CPU.Build.0 = Release|Any CPU
29+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Release|x64.ActiveCfg = Release|Any CPU
30+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Release|x64.Build.0 = Release|Any CPU
31+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Release|x86.ActiveCfg = Release|Any CPU
32+
{F658D8FD-352D-4112-A3E8-A1EE26DC7540}.Release|x86.Build.0 = Release|Any CPU
33+
EndGlobalSection
34+
EndGlobal
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
<PropertyGroup>
3+
<OutputType>Exe</OutputType>
4+
<TargetFramework>netcoreapp3.1</TargetFramework>
5+
</PropertyGroup>
6+
7+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
8+
<Tailcalls>true</Tailcalls>
9+
</PropertyGroup>
10+
11+
<ItemGroup>
12+
<Compile Include="Program.fs" />
13+
</ItemGroup>
14+
15+
<ItemGroup>
16+
<PackageReference Include="FSharp.Compiler.Service" Version="35.0.0" />
17+
<PackageReference Include="Newtonsoft.Json" Version="12.0.3" />
18+
<PackageReference Update="FSharp.Core" Version="4.7.2" />
19+
</ItemGroup>
20+
</Project>
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
module FSharpTokenizer
2+
3+
open System
4+
open System.IO
5+
open System.IO.Compression
6+
open System.Linq
7+
8+
open FSharp.Compiler.SourceCodeServices
9+
10+
open Newtonsoft.Json
11+
12+
/// Scans every remaining token on a single line and prepends each one to `tokens`.
///
/// `tokenizer` is the line tokenizer that is asked for tokens, `state` is the
/// lexer state to start scanning from, and `line` is the text the token columns
/// index into. Each scanned token is paired with the text it covers, cut from
/// `line` between the token's left and right columns (the `+ 1` treats the
/// right column as inclusive).
///
/// Returns the lexer state reported when no further token is available,
/// together with the grown accumulator. Tokens are prepended, so the
/// accumulator is in reverse scanning order.
let rec tokenizeLine (tokenizer : FSharpLineTokenizer) state (line : string) (tokens : _ list) =
    let scanned, nextState = tokenizer.ScanToken(state)
    match scanned with
    | None -> nextState, tokens
    | Some info ->
        let width = info.RightColumn - info.LeftColumn + 1
        let entry = {| Token = info; Value = line.Substring(info.LeftColumn, width) |}
        tokenizeLine tokenizer nextState line (entry :: tokens)
18+
19+
/// Tokenizes `lines` in order, threading the lexer state from each line into
/// the next, and returns all collected tokens in scanning order.
///
/// `sourceTok` creates one line tokenizer per line. `state` is the lexer state
/// for the first line. `tokens` is the initial accumulator (most recent token
/// first); it is reversed once at the end. `count` is accepted to keep the
/// signature stable but does not influence the result.
let tokenizeLines (sourceTok : FSharpSourceTokenizer) state (count : int) tokens lines =
    // One fold step: tokenize a single line, carrying state and accumulator forward.
    let scanLine (lexState, acc) line =
        tokenizeLine (sourceTok.CreateLineTokenizer(line)) lexState line acc

    let _, reversed = List.fold scanLine (state, tokens) lines
    List.rev reversed
26+
27+
/// Reads the file at `filePath` line by line and tokenizes the whole file.
///
/// The file is read completely (and the reader closed) before tokenizing
/// starts. Tokenizing begins from `FSharpTokenizerLexState.Initial` with no
/// conditional-compilation defines. Returns the tokens in scanning order.
let tokenizeFile (filePath : string) =
    // Collects every line of the file into a list; ReadLine yields null at end of file.
    let readAllLines () =
        use reader = new StreamReader(filePath)

        let rec collect acc =
            match reader.ReadLine() with
            | null -> List.rev acc
            | text -> collect (text :: acc)

        collect []

    let sourceLines = readAllLines ()
    let sourceTok = FSharpSourceTokenizer([], Some filePath)
    tokenizeLines sourceTok FSharpTokenizerLexState.Initial 1 [] sourceLines
37+
38+
/// Returns the text of the tokens found in the file at `filepath`.
///
/// When `onlyIdentifiers` is true, only tokens whose character class is
/// `FSharpTokenCharKind.Identifier` are kept; otherwise every token is
/// returned. Order follows the order of the tokens in the file.
let getFileIdentifierTokens (filepath : string) (onlyIdentifiers : bool) =
    [ for entry in tokenizeFile filepath do
        if not onlyIdentifiers || entry.Token.CharClass = FSharpTokenCharKind.Identifier then
            yield entry.Value ]
46+
47+
/// Builds the JSON document for one source file.
///
/// The serialized object has two properties: `tokens`, the token texts
/// returned by `getFileIdentifierTokens`, and `filename`, the path of
/// `filepath` relative to `baseDir`.
let getJsonForFile (filepath : string) (onlyIdentifiers : bool) (baseDir : string) : string =
    let tokenValues = getFileIdentifierTokens filepath onlyIdentifiers
    let relativePath = Path.GetRelativePath(baseDir, filepath)
    let payload = {| tokens = tokenValues; filename = relativePath |}
    JsonConvert.SerializeObject(payload)
53+
54+
/// Tokenizes every `*.fs` file under `projectDir` (searched recursively) and
/// writes one JSON document per file, one per line, into a gzip-compressed
/// file named `<projectDir name>.jsonl.gz` inside `outputDir`.
///
/// `baseDir` is the directory that the file names in the JSON are made
/// relative to; `onlyIdentifiers` is forwarded to `getJsonForFile`.
/// The per-file JSON is produced through a parallel query, while the lines
/// are written by this single loop.
let extractForProjectFolder (baseDir : string) (outputDir : string) (onlyIdentifiers : bool) (projectDir : string) =
    let archivePath = Path.Combine(outputDir, Path.GetFileName(projectDir) + ".jsonl.gz")

    // Disposal runs in reverse order: writer first, then gzip, then the file.
    use output = File.Create(archivePath)
    use gzip = new GZipStream(output, CompressionMode.Compress, false)
    use writer = new StreamWriter(gzip)

    let sourceFiles = Directory.EnumerateFiles(projectDir, "*.fs", SearchOption.AllDirectories)
    let jsonLines = sourceFiles.AsParallel().Select(fun path -> getJsonForFile path onlyIdentifiers baseDir)

    jsonLines |> Seq.iter (fun (json : string) -> writer.WriteLine(json))
66+
67+
/// Program entry point.
///
/// Expects exactly three arguments: the folder whose immediate sub-folders are
/// the projects to tokenize, the folder that receives one `.jsonl.gz` file per
/// project, and `true`/`false` selecting whether only identifier tokens are
/// kept.
///
/// Returns 0 on success. Prints the usage line and returns -1 when the
/// argument count is wrong or the third argument is not a valid boolean.
[<EntryPoint>]
let main argv =
    let usage () =
        Console.WriteLine("Usage <projectsFolder> <outputFolder> true|false")
        -1

    if argv.Length <> 3 then
        usage ()
    else
        // TryParse instead of Parse: a malformed flag is a usage error, not an
        // unhandled FormatException.
        match Boolean.TryParse(argv.[2]) with
        | false, _ -> usage ()
        | true, onlyIdentifiers ->
            let projectsDir = argv.[0]
            let outputDir = argv.[1]

            // The per-project output file is created with File.Create, which fails
            // if the folder is missing; CreateDirectory is a no-op when it exists.
            Directory.CreateDirectory(outputDir) |> ignore

            // Seq.iter invokes the callback sequentially, so the directory
            // sequence is not wrapped in AsParallel(); each project already
            // parallelizes over its own files.
            Directory.EnumerateDirectories(projectsDir)
            |> Seq.iter (extractForProjectFolder projectsDir outputDir onlyIdentifiers)

            0

0 commit comments

Comments
 (0)