Commit c0d849c
olveyra
allow to define field extractors over text contents only, without need to re parsing. Added tests and fixed current ones, as a special flag was added on HtmlDataFragment objects
1 parent 3e6244a commit c0d849c
File tree
8 files changed
+149
-50
lines changed- scrapely
- extraction
- tests
- samples
8 files changed
+149
-50
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
96 | 96 | | |
97 | 97 | | |
98 | 98 | | |
| 99 | + | |
| 100 | + | |
| 101 | + | |
| 102 | + | |
99 | 103 | | |
100 | 104 | | |
101 | 105 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
75 | 75 | | |
76 | 76 | | |
77 | 77 | | |
| 78 | + | |
| 79 | + | |
| 80 | + | |
| 81 | + | |
| 82 | + | |
| 83 | + | |
| 84 | + | |
| 85 | + | |
| 86 | + | |
78 | 87 | | |
79 | 88 | | |
80 | 89 | | |
| |||
87 | 96 | | |
88 | 97 | | |
89 | 98 | | |
90 | | - | |
| 99 | + | |
| 100 | + | |
| 101 | + | |
| 102 | + | |
| 103 | + | |
91 | 104 | | |
92 | 105 | | |
93 | 106 | | |
| |||
111 | 124 | | |
112 | 125 | | |
113 | 126 | | |
| 127 | + | |
| 128 | + | |
| 129 | + | |
| 130 | + | |
| 131 | + | |
| 132 | + | |
| 133 | + | |
| 134 | + | |
| 135 | + | |
| 136 | + | |
114 | 137 | | |
115 | 138 | | |
116 | 139 | | |
117 | 140 | | |
118 | 141 | | |
119 | 142 | | |
120 | | - | |
| 143 | + | |
121 | 144 | | |
122 | | - | |
| 145 | + | |
123 | 146 | | |
124 | 147 | | |
| 148 | + | |
125 | 149 | | |
126 | 150 | | |
127 | | - | |
| 151 | + | |
128 | 152 | | |
129 | 153 | | |
130 | 154 | | |
| |||
171 | 195 | | |
172 | 196 | | |
173 | 197 | | |
174 | | - | |
| 198 | + | |
175 | 199 | | |
176 | 200 | | |
177 | 201 | | |
| |||
183 | 207 | | |
184 | 208 | | |
185 | 209 | | |
186 | | - | |
| 210 | + | |
187 | 211 | | |
188 | 212 | | |
189 | 213 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
211 | 211 | | |
212 | 212 | | |
213 | 213 | | |
214 | | - | |
| 214 | + | |
| 215 | + | |
215 | 216 | | |
216 | 217 | | |
217 | 218 | | |
218 | | - | |
| 219 | + | |
| 220 | + | |
219 | 221 | | |
220 | 222 | | |
221 | 223 | | |
222 | | - | |
| 224 | + | |
| 225 | + | |
223 | 226 | | |
224 | 227 | | |
225 | 228 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
212 | 212 | | |
213 | 213 | | |
214 | 214 | | |
215 | | - | |
| 215 | + | |
| 216 | + | |
216 | 217 | | |
217 | 218 | | |
218 | 219 | | |
| |||
274 | 275 | | |
275 | 276 | | |
276 | 277 | | |
277 | | - | |
| 278 | + | |
| 279 | + | |
278 | 280 | | |
279 | 281 | | |
280 | 282 | | |
281 | | - | |
| 283 | + | |
| 284 | + | |
282 | 285 | | |
283 | 286 | | |
284 | 287 | | |
285 | | - | |
| 288 | + | |
| 289 | + | |
286 | 290 | | |
287 | 291 | | |
288 | 292 | | |
| |||
306 | 310 | | |
307 | 311 | | |
308 | 312 | | |
309 | | - | |
| 313 | + | |
| 314 | + | |
310 | 315 | | |
311 | 316 | | |
312 | 317 | | |
| |||
330 | 335 | | |
331 | 336 | | |
332 | 337 | | |
333 | | - | |
| 338 | + | |
| 339 | + | |
334 | 340 | | |
335 | 341 | | |
336 | 342 | | |
| |||
345 | 351 | | |
346 | 352 | | |
347 | 353 | | |
348 | | - | |
| 354 | + | |
| 355 | + | |
349 | 356 | | |
350 | 357 | | |
351 | 358 | | |
| |||
986 | 993 | | |
987 | 994 | | |
988 | 995 | | |
989 | | - | |
| 996 | + | |
| 997 | + | |
990 | 998 | | |
991 | 999 | | |
992 | 1000 | | |
| |||
8851 | 8859 | | |
8852 | 8860 | | |
8853 | 8861 | | |
8854 | | - | |
| 8862 | + | |
| 8863 | + | |
8855 | 8864 | | |
8856 | 8865 | | |
8857 | 8866 | | |
| |||
9551 | 9560 | | |
9552 | 9561 | | |
9553 | 9562 | | |
9554 | | - | |
| 9563 | + | |
| 9564 | + | |
9555 | 9565 | | |
9556 | 9566 | | |
9557 | 9567 | | |
| |||
9576 | 9586 | | |
9577 | 9587 | | |
9578 | 9588 | | |
9579 | | - | |
| 9589 | + | |
| 9590 | + | |
9580 | 9591 | | |
9581 | 9592 | | |
9582 | 9593 | | |
| |||
21129 | 21140 | | |
21130 | 21141 | | |
21131 | 21142 | | |
21132 | | - | |
| 21143 | + | |
| 21144 | + | |
21133 | 21145 | | |
21134 | 21146 | | |
21135 | 21147 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
244 | 244 | | |
245 | 245 | | |
246 | 246 | | |
247 | | - | |
| 247 | + | |
| 248 | + | |
248 | 249 | | |
249 | 250 | | |
250 | 251 | | |
| |||
306 | 307 | | |
307 | 308 | | |
308 | 309 | | |
309 | | - | |
| 310 | + | |
| 311 | + | |
310 | 312 | | |
311 | 313 | | |
312 | 314 | | |
313 | | - | |
| 315 | + | |
| 316 | + | |
314 | 317 | | |
315 | 318 | | |
316 | 319 | | |
317 | | - | |
| 320 | + | |
| 321 | + | |
318 | 322 | | |
319 | 323 | | |
320 | 324 | | |
| |||
338 | 342 | | |
339 | 343 | | |
340 | 344 | | |
341 | | - | |
| 345 | + | |
| 346 | + | |
342 | 347 | | |
343 | 348 | | |
344 | 349 | | |
| |||
362 | 367 | | |
363 | 368 | | |
364 | 369 | | |
365 | | - | |
| 370 | + | |
| 371 | + | |
366 | 372 | | |
367 | 373 | | |
368 | 374 | | |
| |||
377 | 383 | | |
378 | 384 | | |
379 | 385 | | |
380 | | - | |
| 386 | + | |
| 387 | + | |
381 | 388 | | |
382 | 389 | | |
383 | 390 | | |
| |||
993 | 1000 | | |
994 | 1001 | | |
995 | 1002 | | |
996 | | - | |
| 1003 | + | |
| 1004 | + | |
997 | 1005 | | |
998 | 1006 | | |
999 | 1007 | | |
| |||
8828 | 8836 | | |
8829 | 8837 | | |
8830 | 8838 | | |
8831 | | - | |
| 8839 | + | |
| 8840 | + | |
8832 | 8841 | | |
8833 | 8842 | | |
8834 | 8843 | | |
| |||
9501 | 9510 | | |
9502 | 9511 | | |
9503 | 9512 | | |
9504 | | - | |
| 9513 | + | |
| 9514 | + | |
9505 | 9515 | | |
9506 | 9516 | | |
9507 | 9517 | | |
| |||
9526 | 9536 | | |
9527 | 9537 | | |
9528 | 9538 | | |
9529 | | - | |
| 9539 | + | |
| 9540 | + | |
9530 | 9541 | | |
9531 | 9542 | | |
9532 | 9543 | | |
| |||
20956 | 20967 | | |
20957 | 20968 | | |
20958 | 20969 | | |
20959 | | - | |
| 20970 | + | |
| 20971 | + | |
20960 | 20972 | | |
20961 | 20973 | | |
20962 | 20974 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
915 | 915 | | |
916 | 916 | | |
917 | 917 | | |
| 918 | + | |
| 919 | + | |
| 920 | + | |
| 921 | + | |
| 922 | + | |
| 923 | + | |
| 924 | + | |
| 925 | + | |
| 926 | + | |
| 927 | + | |
| 928 | + | |
| 929 | + | |
| 930 | + | |
| 931 | + | |
| 932 | + | |
| 933 | + | |
| 934 | + | |
| 935 | + | |
| 936 | + | |
| 937 | + | |
| 938 | + | |
| 939 | + | |
918 | 940 | | |
919 | 941 | | |
920 | 942 | | |
| |||
934 | 956 | | |
935 | 957 | | |
936 | 958 | | |
| 959 | + | |
| 960 | + | |
| 961 | + | |
937 | 962 | | |
938 | 963 | | |
939 | 964 | | |
| |||
1178 | 1203 | | |
1179 | 1204 | | |
1180 | 1205 | | |
| 1206 | + | |
| 1207 | + | |
| 1208 | + | |
| 1209 | + | |
| 1210 | + | |
| 1211 | + | |
| 1212 | + | |
| 1213 | + | |
| 1214 | + | |
| 1215 | + | |
| 1216 | + | |
| 1217 | + | |
| 1218 | + | |
1181 | 1219 | | |
1182 | 1220 | | |
1183 | 1221 | | |
| |||
0 commit comments