Skip to content

Commit 71d496b

Browse files
author
Hamel Husain
authored
Create GetIssues.sql
1 parent 133430a commit 71d496b

File tree

1 file changed

+30
-0
lines changed

1 file changed

+30
-0
lines changed

GetIssues.sql

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
-- GetIssues: URL, title and body of GitHub issues at the moment they were
-- opened, read from the 2017 daily GitHub Archive tables (BigQuery Standard SQL).
--
-- Output columns: issue_url, issue_title, body.
--
-- NOTE: JSON_EXTRACT returns JSON-formatted values, so the extracted strings
-- keep their JSON quoting and escape sequences. That is why the action is
-- compared against "\"opened\"" and why newlines are matched below as the
-- two-character sequence \n rather than as a real line break.
WITH opened_issues AS (
    SELECT
        JSON_EXTRACT(payload, '$.issue.html_url') AS url
        , JSON_EXTRACT(payload, '$.issue.title') AS raw_title
        , JSON_EXTRACT(payload, '$.issue.body') AS raw_body
    FROM `githubarchive.day.2017*`
    WHERE
        -- Covers every daily table from 0101 through 1231. Scanning these
        -- tables costs money, so narrow the range when sampling is enough.
        _TABLE_SUFFIX BETWEEN '0101' AND '1231'
        AND type = "IssuesEvent"
        -- Keep only the "opened" action; other actions on the same issue
        -- would otherwise show up as duplicates.
        AND JSON_EXTRACT(payload, '$.action') = "\"opened\""
)
, cleaned_issues AS (
    SELECT
        url AS issue_url
        -- Inner REGEXP_REPLACE: turn escaped newlines, parentheses, square
        -- brackets, '#', '*' and backticks into spaces, then trim and lowercase.
        -- Outer REGEXP_REPLACE: collapse every run of two or more whitespace
        -- characters into a single space.
        , REGEXP_REPLACE(
            LOWER(TRIM(REGEXP_REPLACE(raw_title, r"\\n|\(|\)|\[|\]|#|\*|`", ' ')))
            , r"\s{2,}", ' '
        ) AS issue_title
        , REGEXP_REPLACE(
            LOWER(TRIM(REGEXP_REPLACE(raw_body, r"\\n|\(|\)|\[|\]|#|\*|`", ' ')))
            , r"\s{2,}", ' '
        ) AS body
    FROM opened_issues
)
SELECT
    issue_url
    , issue_title
    , body
FROM cleaned_issues
WHERE
    -- Filters run on the whitespace-collapsed text. SPLIT yields an empty
    -- element between consecutive spaces, so counting before the collapse
    -- would let runs of replaced markup characters inflate the word count.
    --
    -- Arbitrary thresholds to drop empty or sparse issues: the body needs at
    -- least 6 space-separated tokens and the title at least 3.
    ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
    AND ARRAY_LENGTH(SPLIT(issue_title, ' ')) >= 3
    -- Drop very long titles and bodies (outliers that slow tokenization down).
    AND LENGTH(issue_title) <= 400
    AND LENGTH(body) <= 2000

0 commit comments

Comments
 (0)