|
-- Builds a (url, title, body) dataset of GitHub issues opened in 2017 from the
-- public GitHub Archive daily tables (BigQuery Standard SQL).
--
-- Output columns:
--   issue_url   : JSON_EXTRACT of $.issue.html_url (still JSON-quoted)
--   issue_title : lower-cased title with markup characters blanked out and
--                 runs of whitespace collapsed to a single space
--   body        : issue body, cleaned the same way as the title
--
-- NOTE: JSON_EXTRACT returns JSON-formatted strings, so the extracted values
-- keep their surrounding double quotes (this is why the action filter below
-- compares against "\"opened\""). Strip them downstream if they are unwanted.

WITH opened_issues AS (
    -- One row per "opened" IssuesEvent; later events for the same issue
    -- (closed, reopened, ...) are skipped so an issue is not duplicated.
    SELECT
        JSON_EXTRACT(payload, '$.issue.html_url') AS url,
        -- Replace escaped newlines (the two characters \n in the JSON text),
        -- parentheses, square brackets, '#', '*' and backticks with a space,
        -- then trim and lower-case.
        LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) AS title,
        LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) AS body
    FROM `githubarchive.day.2017*`
    WHERE
        -- This suffix range matches every daily table of 2017. Querying these
        -- tables costs money: narrow the range to scan fewer days.
        _TABLE_SUFFIX BETWEEN '0101' AND '1231'
        AND type = "IssuesEvent"
        AND JSON_EXTRACT(payload, '$.action') = "\"opened\""
),

normalized_issues AS (
    -- Collapse runs of two or more whitespace characters into one space.
    -- This happens BEFORE filtering: the replacements above leave runs of
    -- spaces, and SPLIT(x, ' ') emits an empty element for every extra space,
    -- which would otherwise inflate the word counts used below.
    SELECT
        url AS issue_url,
        REGEXP_REPLACE(title, r"\s{2,}", ' ') AS issue_title,
        REGEXP_REPLACE(body, r"\s{2,}", ' ') AS body
    FROM opened_issues
)

SELECT
    issue_url,
    issue_title,
    body
FROM normalized_issues
WHERE
    -- The body must be at least 6 words long and the title at least 3 words
    -- long; this is an arbitrary way to filter out empty or sparse issues.
    ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
    AND ARRAY_LENGTH(SPLIT(issue_title, ' ')) >= 3
    -- Filter out issues that have really long titles or bodies
    -- (these are outliers, and will slow tokenization down). Lengths are
    -- measured on the normalized text that is actually emitted.
    AND LENGTH(issue_title) <= 400
    AND LENGTH(body) <= 2000
0 commit comments