Skip to content

Commit 9e117c7

Browse files
committed
Add first working version of tutorial indexing
1 parent 0c01ac6 commit 9e117c7

File tree

2 files changed

+121
-0
lines changed

2 files changed

+121
-0
lines changed

_search/server/index-sites.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import logging, os, sys
44
import jekyll, ijsite, tsutil
5+
import tutorials
56

67

78
logger = logging.getLogger('indexer')
@@ -12,6 +13,8 @@ def load_site(siteroot):
1213
return jekyll.load_jekyll_site(siteroot)
1314
if ijsite.is_imagej_website(siteroot):
1415
return ijsite.load_site(siteroot)
16+
if tutorials.is_imagej_tutorials(siteroot):
17+
return tutorials.load_imagej_tutorials(siteroot)
1518
return None
1619

1720

_search/server/tutorials.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/bin/env python
2+
3+
# Parse ImageJ tutorials into documents for
4+
# use with their own searchable collection.
5+
6+
import logging, traceback, re, sys
7+
import json
8+
from parseutil import first_sentence
9+
from pathlib import Path
10+
from pprint import pprint
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
def is_imagej_tutorials(root):
    """
    Report whether the given folder looks like an imagej/tutorials checkout.

    The check is purely structural: the folder must contain both a
    'java' and a 'notebooks' subdirectory.

    :param root: Path (str or Path) of the folder to inspect.
    :return: True if both subdirectories exist, False otherwise.
    """
    base = Path(root)
    # pathlib spells the check is_dir(); Path has no isdir() method.
    return (base / 'java').is_dir() and (base / 'notebooks').is_dir()
19+
20+
21+
def parse_java_source(path):
    """
    Parse a Java source file into a document.

    The whole file is indexed verbatim as the document content; no
    attempt is made to separate comments, identifiers or code.

    :param path: Path of the .java file to read.
    :return: A dict with a single 'content' key holding the file's text.
    :raises OSError: If the file cannot be opened.
    :raises UnicodeDecodeError: If the file is not valid UTF-8.
    """
    logger.debug('Parsing Java source file %s...', path)

    # Java sources are plain text, not JSON, so read them as-is.
    with open(path, encoding='utf-8') as f:
        content = f.read()

    # This is dumb -- do we want to do better?
    doc = {}
    doc['content'] = content

    return doc
32+
33+
34+
def parse_notebook(path):
    """
    Parse a Jupyter notebook (.ipynb) file into a document.

    Each entry of the notebook's top-level 'cells' list is converted to
    text by process_cell, and the pieces are concatenated in order.

    :param path: Path of the notebook file to read.
    :return: A dict with a single 'content' key holding the combined text.
    :raises KeyError: If the notebook JSON has no top-level 'cells' entry.
    :raises json.JSONDecodeError: If the file is not valid JSON.
    """
    logger.debug('Parsing notebook %s...', path)

    # Read as UTF-8 explicitly rather than relying on the platform's
    # default encoding, which differs between operating systems.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    doc = {}
    doc['content'] = ''.join(process_cell(cell) for cell in data['cells'])

    return doc
47+
48+
# type of cell is dict
49+
def process_cell(cell):
    """
    Flatten one notebook cell into a single filtered string.

    The cell's 'source' (if present) comes first, followed by the text of
    each entry in 'outputs' (if present): its 'text' field, then any
    'text/html' or 'text/plain' payloads found in its 'data' mapping.
    Every piece is passed through filter_data before being appended.

    :param cell: A notebook cell as a dict.
    :return: The concatenated, filtered text of the cell.
    """
    indexed_mime_types = ('text/html', 'text/plain')
    pieces = []

    if 'source' in cell:
        pieces.append(filter_data("".join(cell['source'])))

    # Cells without an 'outputs' entry simply contribute their source.
    for output in cell.get('outputs', ()):
        if 'text' in output:
            pieces.append(filter_data("".join(output['text'])))

        if 'data' not in output:
            continue

        # Walk the mapping in its own order so the result keeps the
        # ordering of the payloads as stored in the notebook.
        for mime_type, payload in output['data'].items():
            if mime_type in indexed_mime_types:
                pieces.append(filter_data("".join(payload)))

    return ''.join(pieces)
66+
67+
# takes input of string; filters html and other data
68+
def filter_data(data):
    """
    Strip HTML-style tags from a string.

    Anything of the form '<...>' (including tags spanning several lines)
    is removed; all other text is returned untouched. A lone '<' with no
    closing '>' is left in place.

    :param data: The string to filter.
    :return: The input with every '<...>' span removed.
    """
    # TODO: remove other markup (only '<...>' tags are stripped so far).
    return re.sub(r'<[^>]*>', '', data)
73+
74+
def load_imagej_tutorials(root):
    """
    Loads the content from the given imagej/tutorials folder.
    See: https://github.com/imagej/tutorials

    Every *.java file below <root>/java and every *.ipynb file below
    <root>/notebooks is parsed into a document. Each document gets a
    'url' pointing at the file on the master branch of the GitHub
    repository. Files that fail to parse are logged and skipped.

    :param root: Path (str or Path) of the imagej/tutorials checkout.
    :return: A list of document dicts, Java sources first, then notebooks.
    :raises ValueError: If root lacks a 'java' or 'notebooks' subdirectory.
    """
    base = Path(root)
    java = base / 'java'
    notebooks = base / 'notebooks'
    if not java.is_dir() or not notebooks.is_dir():
        raise ValueError(f'The path {root} does not appear to be an imagej/tutorials checkout.')

    def blob_url(path):
        # Build the URL from the path relative to the checkout root.
        # relative_to() + as_posix() is immune to trailing slashes,
        # './' prefixes and Windows separators, unlike string slicing.
        relative = path.relative_to(base).as_posix()
        return f'https://github.com/imagej/tutorials/blob/master/{relative}'

    logger.info('Loading content...')
    documents = []

    # rglob() already searches recursively; no '**/' prefix is needed.
    for javafile in java.rglob('*.java'):
        try:
            doc = parse_java_source(javafile)
            if doc:
                doc['url'] = blob_url(javafile)
                documents.append(doc)
        except Exception:
            # Best effort: record the failure with its traceback, move on.
            logger.exception(f'Failed to parse {javafile}:')
    java_count = len(documents)
    logger.info(f'Loaded {java_count} documents from Java source files')

    for nbfile in notebooks.rglob('*.ipynb'):
        try:
            doc = parse_notebook(nbfile)
            if doc:
                doc['url'] = blob_url(nbfile)
                documents.append(doc)
        except Exception:
            logger.exception(f'Failed to parse {nbfile}:')
    # Report only the notebooks, not the running total.
    logger.info(f'Loaded {len(documents) - java_count} documents from Jupyter notebooks')

    return documents
110+
111+
def main(args):
    """
    Load the tutorials found at args[0] and print each document's URL.

    :param args: Command-line arguments; args[0] is the path of the
                 imagej/tutorials checkout to load.
    :raises SystemExit: With a usage message if no path is given.
    """
    if not args:
        sys.exit('Usage: tutorials.py <path-to-imagej-tutorials-checkout>')

    docs = load_imagej_tutorials(args[0])
    for doc in docs:
        # Tolerate documents that carry no 'url' key instead of crashing.
        print(doc.get('url', '(no url)'))


if __name__ == '__main__':
    # Take the checkout location from the command line rather than from
    # a path that only exists on one developer's machine.
    main(sys.argv[1:])

0 commit comments

Comments
 (0)