Skip to content

Commit 8adaf3e

Browse files
committed
1 parent 2f31646 commit 8adaf3e

File tree

2 files changed, with 27 additions (+27) and 6 deletions (-6).

google_scholar_crawler/main.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,31 @@
1-
from scholarly import scholarly
1+
from scholarly import scholarly, ProxyGenerator
22
import jsonpickle
33
import json
44
from datetime import datetime
55
import os
6+
import time
7+
8+
max_attempts = 100
9+
wait_seconds = 600 # 10 minutes
10+
11+
for attempt in range(1, max_attempts + 1):
12+
try:
13+
print(f"Attempt {attempt}:")
14+
# Setup proxy
15+
pg = ProxyGenerator()
16+
pg.FreeProxies() # Use free rotating proxies
17+
scholarly.use_proxy(pg)
18+
19+
author: dict = scholarly.search_author_id(os.environ['GOOGLE_SCHOLAR_ID'])
20+
scholarly.fill(author, sections=['basics', 'indices', 'counts', 'publications'])
21+
print(f"Attempt {attempt} success")
22+
break # Exit loop on first success
23+
except Exception as e:
24+
print(f"Attempt {attempt} failed with error: {e}")
25+
time.sleep(wait_seconds)
26+
else:
27+
print("All 100 attempts failed.")
628

7-
author: dict = scholarly.search_author_id(os.environ['GOOGLE_SCHOLAR_ID'])
8-
scholarly.fill(author, sections=['basics', 'indices', 'counts', 'publications'])
929
name = author['name']
1030
author['updated'] = str(datetime.now())
1131
author['publications'] = {v['author_pub_id']:v for v in author['publications']}
@@ -20,4 +40,4 @@
2040
"message": f"{author['citedby']}",
2141
}
2242
with open(f'results/gs_data_shieldsio.json', 'w') as outfile:
23-
json.dump(shieldio_data, outfile, ensure_ascii=False)
43+
json.dump(shieldio_data, outfile, ensure_ascii=False)
Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
jsonpickle==1.4.2
2-
scholarly==1.5.1
1+
jsonpickle==4.0.5
2+
scholarly==1.7.11
3+
httpx==0.23.3

0 commit comments

Comments (0)