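"""
generate-legacy-users.py WIKI_PAGES_PATH

Walk the wiki page directories under WIKI_PAGES_PATH and print, for every
Cookbook, PerformancePython and ParallelProgramming page, the users who
edited it, followed by a summary of the user IDs that could not be resolved
to a name.

"""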
| 5 | +from __future__ import absolute_import, print_function, division |
| 6 | + |
| 7 | +import os |
| 8 | +import re |
| 9 | +import sys |
| 10 | +from HTMLParser import HTMLParser |
| 11 | + |
| 12 | + |
# Indices of the fields used from a whitespace-split edit-log line.
_LOG_PAGE_FIELD = 3
_LOG_UID_FIELD = 6

# Page names that must never be recorded as a user name, even when they are
# the first usable entry in the edit log of a directory flagged as a home page.
_NON_USER_PAGES = ('About_SciPy', 'SciPy', 'Cookbook(2f)MayaVi(2f)tvtk')

# A directory is reported when its name contains any of these substrings.
_TRACKED_PAGE_MARKERS = ('Cookbook', 'PerformancePython', 'ParallelProgramming')

# Hand-maintained user-id -> name table; it takes precedence over whatever is
# detected from the wiki pages.
_PREDEFINED_USERS = {
    '1273234778.27.13541': 'arjen',
    '1181049059.11.16046': 'WarrenWeckesser',
    '1232509635.1.1790': 'WarrenWeckesser',
    '1143464513.17.11899': 'GaelVaroquaux',
    '1359829272.72.54252': 'FrankBreitling',
    '1196968472.52.21357': 'jesrl',
    '1310512145.5.35406': 'RalphMoore',
    '1134987132.31.5715': 'AndrewStraw',
    '1283944978.14.25260': 'UnuTbu',
    '1150066934.85.44238': 'FredericPetit',
    '1157157190.0.28500': 'AMArchibald',
    '1193155369.79.45281': 'Elby',
    '1162990926.75.41968': 'PauliVirtanen',
    '1144823769.21.43377': 'AngusMcMorland',
    '1199025820.05.62034': 'TimMichelsen',
    '1165998335.9.59069': 'MartinSpacek',
    '1169591527.88.61566': 'MattKnox',
    '1278911090.12.12663': 'ChristopherCampo',
    '1230492524.42.55666': 'nokfi',
    '1166654035.38.11968': 'VincentNijs',
    '1160664185.24.177': 'NeilMB',
    '1148241299.31.23452': 'GabrielGellner',
    '1143248516.72.17557': 'FrancescAltet',
    '1138755498.13.1844': 'BillBaxter',
    '1138639075.54.47297': 'jh',
    '1135217126.43.265': 'FernandoPerez',
    '1228612570.79.23812': 'EgorZindy',
    '1166684071.04.43914': 'ScottSinclair',
    '1153060908.53.58092': 'EmmanuelleGouillart',
    '1152996811.03.49324': 'NickFotopoulos',
    '1135013651.92.25239': 'PearuPeterson',
    '1263714477.79.46523': 'newacct',
    '1321067029.14.1791': 'KristjanOnu',
    '1244315014.64.10666': 'IvoMaljevic',
    '1342900640.41.32910': 'thomas.haslwanter',
    '1138834037.11.63568': 'TimCera',
    '1306523623.53.4799': 'DmitriyRybalkin',
    '1316810730.93.46683': 'TimSwast',
    '1294906831.24.3474': 'MikeToews',
    '1259530275.5.20672': 'JorgeEduardoCardona',
    '1254476605.52.59655': 'wolfganglechner',
    '1220051786.85.3734': 'SimonHook',
    '1321851999.81.53674': 'BAlexRobinson',
    '1245975199.53.27497': 'DavidPowell',
    '1277317890.88.15794': 'AlanLue',
    '1249699417.54.61063': 'mauro',
    '1151666835.94.32020': 'WilliamHunter',
    '1209753612.57.31138': 'JamesNagel',
    '1241897483.76.24144': 'DatChu',
    '1245526844.29.46176': 'RalfGommers',
    '1312558832.94.40303': 'Pierre_deBuyl',
    '1205277370.55.64453': 'keflavich',
    '1147324201.78.18433': 'MichaelMcNeilForbes',
    '1139447249.42.46498': 'RobManagan',
    '1246487580.75.24764': 'MarshallPerrin',
    '1340544644.02.6056': 'WesTurner',
}


def _read_edit_log(edit_log):
    """Return the usable edit-log lines, each split on whitespace.

    Lines with too few fields to contain a user id are dropped, so callers
    can index the page and user-id fields without further checks.
    """
    with open(edit_log, 'r') as handle:
        lines = handle.read().rstrip().splitlines()
    fields = (line.split() for line in lines)
    return [item for item in fields if len(item) > _LOG_UID_FIELD]


def _has_home_page_revision(revs):
    """Return True if any file under *revs* is marked as a user home page."""
    for fn in os.listdir(revs):
        with open(os.path.join(revs, fn), 'r') as handle:
            r_text = handle.read()
        if 'CategoryHomepage' in r_text or 'home page' in r_text:
            return True
    return False


def _gather(path):
    """Walk *path* and collect ``(users, pages)``.

    ``users`` maps a user id to a page name, taken from directories that
    contain a home-page revision.  ``pages`` maps a tracked directory name
    to the user ids found in its edit log, in log order.
    """
    users = {}
    pages = {}

    for root, dirs, _files in os.walk(path):
        for d in dirs:
            revs = os.path.join(root, d, 'revisions')
            edit_log = os.path.join(root, d, 'edit-log')

            if not os.path.isdir(revs) or not os.path.isfile(edit_log):
                continue

            log_items = _read_edit_log(edit_log)

            if _has_home_page_revision(revs):
                # User definition: the first log entry that does not name an
                # excluded page supplies the (uid, name) pair.
                for item in log_items:
                    if item[_LOG_PAGE_FIELD] in _NON_USER_PAGES:
                        continue
                    users[item[_LOG_UID_FIELD]] = item[_LOG_PAGE_FIELD]
                    break

            if any(marker in d for marker in _TRACKED_PAGE_MARKERS):
                uids = [item[_LOG_UID_FIELD] for item in log_items]
                # Never create an empty entry: the report reads uids[-1].
                if uids:
                    pages.setdefault(d, []).extend(uids)

    return users, pages


def _report(users, pages):
    """Print one "page: editors" line per page, then the unknown-uid summary."""
    unknowns = {}       # uid -> number of log entries by that unresolved uid
    page_uid = {}       # uid -> a page whose last log entry is by that uid
    unknown_names = {}  # uid -> "Unknown[n]" placeholder, numbered in order

    for page, uids in sorted(pages.items()):
        editors = []
        seen = set()
        for uid in uids:
            if uid not in users:
                unknowns[uid] = unknowns.get(uid, 0) + 1

            if uid in seen:
                continue
            seen.add(uid)

            user = users.get(uid, 'unknown')
            if user == 'unknown':
                if uid not in unknown_names:
                    unknown_names[uid] = "Unknown[{0}]".format(
                        len(unknown_names) + 1)
                user = unknown_names[uid]
            editors.append(user)

        if page != 'Cookbook(2f)MayaVi(2f)examples':
            page_uid[uids[-1]] = page

        # '(2f)' stands for '/' in the directory names; drop the Cookbook
        # prefix and flatten what is left of the path.
        page = page.replace('(2f)', '/')
        page = page.replace('Cookbook/', '')
        page = page.replace('/', '_')
        print("{0}: {1}".format(page, ", ".join(editors)))

    # Sort by unknown: highest entry count first, ties by descending uid.
    items = sorted(unknowns.items(), key=lambda kv: (kv[1], kv[0]),
                   reverse=True)
    for uid, count in items:
        print(unknown_names[uid], ":", uid, count, page_uid.get(uid, ''))


def main():
    """Report the editors of the tracked wiki pages under ``sys.argv[1]``.

    Exits with status 2 and a usage message on stderr when the path
    argument is missing.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: generate-legacy-users.py WIKI_PAGES_PATH\n")
        sys.exit(2)

    users, pages = _gather(sys.argv[1])

    # Load predefined users (these override anything detected above).
    users.update(_PREDEFINED_USERS)

    _report(users, pages)
| 152 | + |
| 153 | + |
class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text content.

    Feed it markup with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base initializer instead of calling reset() alone: on
        # Python 3 it also sets the parser options that feed() relies on.
        # The explicit form is used because Python 2's HTMLParser is an
        # old-style class, where super() is unavailable.
        HTMLParser.__init__(self)
        self.fed = []  # text fragments in the order they were parsed

    def handle_data(self, d):
        """Collect a run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
| 162 | + |
| 163 | + |
def strip_tags(html):
    """Return *html* with all markup removed, keeping only the text."""
    s = MLStripper()
    s.feed(html)
    # Flush the parser: without close(), trailing text it is still buffering
    # (for example after a bare '&') would be silently dropped.
    s.close()
    return s.get_data()
| 168 | + |
| 169 | + |
class StringMatcher(object):
    """Fuzzy lookup of a string among a fixed collection of known strings.

    NOTE(review): this class uses ``fuzzyset``, ``normalize``, ``splitsub``
    and ``self.aliases``, none of which are imported, defined or assigned in
    the visible part of this file -- confirm where they are meant to come
    from before using it.
    """

    def __init__(self, items):
        """Index every entry of *items*, adding them in sorted order."""
        self.fuzzyset = fuzzyset.FuzzySet(gram_size_lower=3,
                                          gram_size_upper=5)
        for entry in sorted(items):
            self.fuzzyset.add(entry)

    def get(self, item):
        """Return ``(alias, score)`` for the best match, or ``(None, 0)``.

        *item* is looked up once per transformation (``normalize`` and
        ``splitsub``); the candidate with the highest score wins.
        """
        candidates = []

        for transform in (normalize, splitsub):
            query = transform(item)
            if not query:
                continue
            found = self.fuzzyset.get(query)
            if found is not None:
                candidates.extend(found)

        if not candidates:
            return None, 0

        # Highest score first; the sort is stable, so ties keep the order in
        # which the candidates were found.
        candidates.sort(key=lambda pair: -pair[0])
        best_score, best_key = candidates[0]
        # NOTE(review): self.aliases is not set in __init__ -- presumably a
        # mapping from indexed string to canonical name; verify.
        return self.aliases[best_key], best_score
| 194 | + |
| 195 | + |
# Produce the report only when run as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments