Create GetIssues.sql

Hamel Husain · web-flow · commit 71d496b4fb4f · 2018-01-13T17:39:22.000-08:00
diff --git a/GetIssues.sql b/GetIssues.sql
@@ -0,0 +1,30 @@
+SELECT
+  url as issue_url
+  -- replace more than one white-space character in a row with a single space
+, REGEXP_REPLACE(title, r"\s{2,}", ' ') as issue_title
+, REGEXP_REPLACE(body, r"\s{2,}", ' ') as body
+
+FROM(
+    SELECT
+        JSON_EXTRACT(payload, '$.issue.html_url') as url
+        -- extract the title and body removing parentheses, brackets, and quotes
+      , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as title
+      , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as body
+    FROM `githubarchive.day.2017*`
+    WHERE 
+      -- 70 random days in 2017 (because it costs money to query these tables!!)  
+          _TABLE_SUFFIX BETWEEN '0101' and '1231'
+      and type="IssuesEvent" 
+      -- Only want the issue at a specific point otherwise will have duplicates
+      and JSON_EXTRACT(payload, '$.action') = "\"opened\"" 
+) as tbl
+
+WHERE 
+  -- the body must be at least 8 words long and the title at least 3 words long
+  --  this is an arbitrary way to filter out empty or sparse issues
+      ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
+  and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
+  -- filter out issues that have really long titles or bodies
+  --    (these are outliers, and will slow tokenization down).
+  and LENGTH(title) <= 400
+  and LENGTH(body) <= 2000