Skip to content

Commit 6f15a86

Browse files
committed
add function to parse polyglot notebooks
1 parent 095678a commit 6f15a86

File tree

3 files changed

+150
-2
lines changed

3 files changed

+150
-2
lines changed

Directory.Packages.props

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
<PackageVersion Include="Newtonsoft.Json" Version="13.0.3" />
1919
<PackageVersion Include="Suave" Version="2.6.2" />
2020
<PackageVersion Include="System.Memory" Version="4.5.5" />
21+
<PackageVersion Include="System.Text.Json" Version="8.0.0" />
2122
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp" Version="4.7.0" />
2223
<PackageVersion Include="NUnit" Version="3.14.0" />
2324
<PackageVersion Include="FsUnit" Version="5.6.0" />

src/FSharp.Formatting.Literate/FSharp.Formatting.Literate.fsproj

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
<Compile Include="Contexts.fs" />
1717
<Compile Include="ParseScript.fs" />
1818
<Compile Include="ParseMarkdown.fs" />
19+
<Compile Include="ParsePynb.fs" />
1920
<Compile Include="Transformations.fs" />
2021
<Compile Include="Formatting.fs" />
2122
<Compile Include="Literate.fs" />
@@ -26,7 +27,6 @@
2627
<ProjectReference Include="..\FSharp.Formatting.Common\FSharp.Formatting.Common.fsproj" PrivateAssets="all" />
2728
<ProjectReference Include="..\FSharp.Formatting.Markdown\FSharp.Formatting.Markdown.fsproj" PrivateAssets="all" />
2829
</ItemGroup>
29-
3030
<ItemGroup>
3131
<!-- ugly hack: inline p2p libraries in NuGet package
3232
workaround for https://github.com/NuGet/Home/issues/3891 -->
@@ -38,5 +38,6 @@
3838
<ItemGroup>
3939
<PackageReference Include="FSharp.Core" />
4040
<PackageReference Include="FSharp.Compiler.Service" />
41+
<PackageReference Include="System.Text.Json" />
4142
</ItemGroup>
42-
</Project>
43+
</Project>
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
namespace FSharp.Formatting.Literate
2+
3+
open System.IO
4+
open System.Text.Json
5+
6+
module internal ParsePynb =
7+
8+
/// One notebook cell after parsing.
/// `Code` carries the cell's language, its source text and, optionally, the
/// already-rendered output fragments; `Markdown` carries the cell text only.
type ParsedCell =
    | Code of
        {| lang: string
           source: string
           outputs: string[] option |}
    | Markdown of source: string

    /// Renders the cell as Markdown text.
    /// A markdown cell is returned unchanged. A code cell becomes a fenced code
    /// block tagged with `lang`; when outputs are present they are appended
    /// after the fence, separated from it and from each other by blank lines.
    member this.ToMarkdown() =
        match this with
        | Markdown text -> text
        | Code cell ->
            let fenced = $"```{cell.lang}\n{cell.source}\n```"

            match cell.outputs with
            | Some rendered ->
                let joined = String.concat "\n\n" rendered
                $"{fenced}\n\n{joined}\n\n"
            | None -> fenced
26+
27+
/// Converts the entries of a code cell's `outputs` array into HTML fragments.
module Output =

    /// Reads a notebook "multi-line string": either a single JSON string or an
    /// array of string fragments, which are concatenated without a separator.
    /// Any other JSON kind makes `GetString` throw InvalidOperationException.
    let private multilineText (value: JsonElement) =
        match value.ValueKind with
        | JsonValueKind.Array ->
            value.EnumerateArray()
            |> Seq.map (fun fragment -> fragment.GetString())
            |> String.concat ""
        | _ -> value.GetString()

    /// Wraps plain text in the `<table class="pre">` markup.
    /// The text is HTML-encoded first so that characters such as `<`, `>` and
    /// `&` are displayed literally instead of being interpreted as markup.
    let private preformatted (text: string) =
        """<table class="pre"><tr><td><pre><code>"""
        + System.Net.WebUtility.HtmlEncode text
        + """</code></pre></td></tr></table>"""

    /// Returns the `output_type` of an output object, or None when the
    /// property is absent.
    let private tryOutputType (output: JsonElement) =
        match output.TryGetProperty("output_type") with
        | true, kind -> Some(kind.GetString())
        | _ -> None

    /// Matches a data bundle that has a `text/html` entry and yields that HTML
    /// unchanged.
    let (|TextHtml|_|) (x: JsonElement) =
        match x.TryGetProperty("text/html") with
        | true, html -> Some(multilineText html)
        | _ -> None

    /// Matches a data bundle that has a `text/plain` entry and yields the text
    /// HTML-encoded inside the preformatted table markup.
    let (|TextPlain|_|) (x: JsonElement) =
        match x.TryGetProperty("text/plain") with
        | true, text -> Some(preformatted (multilineText text))
        | _ -> None

    /// Matches an output whose `output_type` is "display_data".
    /// `text/html` is preferred over `text/plain`; a bundle with neither
    /// raises via `failwith`.
    let (|DisplayData|_|) (x: JsonElement) =
        if tryOutputType x = Some "display_data" then
            match x.GetProperty("data") with
            | TextHtml html -> Some html
            | TextPlain text -> Some text
            | s -> failwith $"unknown output {s}"
        else
            None

    /// Matches an output whose `output_type` is "stream" and yields its `text`
    /// HTML-encoded inside the preformatted table markup.
    let (|Stream|_|) (x: JsonElement) =
        if tryOutputType x = Some "stream" then
            Some(preformatted (multilineText (x.GetProperty("text"))))
        else
            None

    /// Converts one output object into an HTML fragment.
    /// Raises via `failwith` for any output type other than "stream" and
    /// "display_data", including an output with no `output_type` at all.
    let parse (output: JsonElement) =
        match output with
        | Stream stream -> stream
        | DisplayData displayData -> displayData
        | other ->
            let kind = tryOutputType other |> Option.defaultValue "<missing output_type>"
            failwith $"unknown output {kind}"
79+
80+
/// Returns the text of a cell's `source` property.
/// The property may be a single JSON string or an array of string fragments;
/// fragments are concatenated without a separator.
/// Raises via `failwith "no source"` when the cell has no `source` property.
let getSource (cell: JsonElement) =
    match cell.TryGetProperty("source") with
    // A lone string is a valid multi-line string; EnumerateArray would throw on it.
    | true, source when source.ValueKind = JsonValueKind.String -> source.GetString()
    | true, source ->
        source.EnumerateArray()
        |> Seq.map (fun fragment -> fragment.GetString())
        |> String.concat ""
    | _ -> failwith "no source"
87+
88+
/// Parses every entry of a cell's `outputs` array with `Output.parse`.
/// Returns None when the cell has no `outputs` property or the array is
/// empty; otherwise Some array of the parsed fragments in document order.
let collectOutputs (cell: JsonElement) =
    let found, outputs = cell.TryGetProperty("outputs")

    if not found then
        None
    else
        // An empty input array yields an empty result without calling the parser.
        match outputs.EnumerateArray() |> Seq.map Output.parse |> Seq.toArray with
        | [||] -> None
        | parsed -> Some parsed
98+
99+
/// Builds a `Code` cell from a notebook code-cell object.
/// The language is read from `metadata.polyglot_notebook.kernelName`; each
/// missing level raises via `failwith` with a message naming it. The source
/// and outputs come from `getSource` and `collectOutputs`.
let getCode (cell: JsonElement) =
    // Fetches a required child property, failing with `message` when absent.
    let require (name: string) (message: string) (parent: JsonElement) =
        let found, child = parent.TryGetProperty(name)
        if found then child else failwith message

    let kernelName =
        cell
        |> require "metadata" "Code cell does not have metadata"
        |> require "polyglot_notebook" "code cell does not have metadata.polyglot_notebook"
        |> require "kernelName" "code cell does not have metadata.polyglot_notebook.kernelName"

    // The language is resolved before source and outputs, so a metadata
    // failure is reported first.
    let lang = kernelName.GetString()
    let source = getSource cell
    let outputs = collectOutputs cell

    Code
        {| lang = lang
           source = source
           outputs = outputs |}
125+
126+
127+
/// Converts one notebook cell object into a `ParsedCell`.
/// "markdown" cells become `Markdown` and must not carry outputs; "code" cells
/// are delegated to `getCode`. A missing `cell_type` or any other cell type
/// raises via `failwith`.
let parseCell (cell: JsonElement) =
    let found, cellTypeProperty = cell.TryGetProperty("cell_type")

    let kind =
        if found then
            cellTypeProperty.GetString()
        else
            failwith "no cell type"

    if kind = "markdown" then
        // The source is read before the outputs check, so a missing source is
        // reported first.
        let source = getSource cell

        if Option.isSome (collectOutputs cell) then
            failwith "Markdown should not have outputs"
        else
            Markdown source
    elif kind = "code" then
        getCode cell
    else
        failwith $"unknown cell type {kind}"
140+
141+
/// Reads the notebook file at `ipynbFile` and renders its cells as one
/// Markdown document.
/// Every element of the root `cells` array is parsed with `parseCell`,
/// rendered with `ToMarkdown`, and the results are joined with blank lines.
/// Exceptions from file access, JSON parsing or cell parsing propagate.
let ipynbToMarkdown ipynbFile =
    // JsonDocument is IDisposable and backed by pooled memory, so `use`
    // releases it on exit. This is safe because String.concat below consumes
    // the whole sequence before the function returns.
    use json = JsonDocument.Parse(File.ReadAllText(ipynbFile))

    json.RootElement.GetProperty("cells").EnumerateArray()
    |> Seq.map (parseCell >> (fun x -> x.ToMarkdown()))
    |> String.concat "\n\n"

0 commit comments

Comments
 (0)