
Commit ac82318

Rajat Sharma authored and committed
basic setup for command line tool for scraping with beautiful table
1 parent 9538bb0 commit ac82318

File tree

2 files changed: +152 −0 lines changed


scraped_data.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

web_scraping_command_line_tool.py

Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
# import required modules
import json
import requests
from datetime import datetime
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from beautifultable import BeautifulTable

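# Aside: a minimal, standalone sketch of the beautifultable API used by the
# menu below (reference only, never called; the function name is made up for
# illustration; docs: https://beautifultable.readthedocs.io/en/latest/index.html):
def _demo_beautiful_table():
    table = BeautifulTable()
    table.columns.header = ["key", "value"]
    table.rows.append(["example", 42])
    print(table)
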
def load_json(database_json_file="scraped_data.json"):
    """
    Load JSON data from the scraped_data.json file if it exists;
    otherwise return an empty dict.
    """
    try:
        with open(database_json_file, "r") as read_it:
            all_data_base = json.loads(read_it.read())
            return all_data_base
    except (FileNotFoundError, json.JSONDecodeError):
        # No database yet (or unreadable JSON): start with an empty dict.
        all_data_base = dict()
        return all_data_base

def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
    """
    Save the scraped data in JSON format to scraped_data.json, creating the
    file if it does not exist. If the file already exists, previously
    scraped data can be viewed via option 1.
    """
    with open(database_json_file, "w") as file_obj:
        file_obj.write(json.dumps(data))

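# A quick sanity check of the two persistence helpers above (reference only,
# never called by the menu; the file name and payload are made-up examples):
def _demo_persistence_round_trip():
    sample_db = {"scraped_data": {}}
    save_scraped_data_in_json(sample_db, "demo_scraped_data.json")
    assert load_json("demo_scraped_data.json") == sample_db
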
def existing_scraped_data_init(json_db):
    """
    Initialise the 'scraped_data' key from the JSON database if it already
    holds data; otherwise create an empty dict for it.
    """
    scraped_data = json_db.get("scraped_data")
    if scraped_data is None:
        json_db['scraped_data'] = dict()

    return None

def scraped_time_is():
    """
    Create a timestamp (e.g. "25/12/2024 13:45:09") so each scrape
    record stays trackable.
    """
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    return dt_string

def process_url_request(website_url):
    """
    Request the provided URL with the requests module and construct soup
    data with BeautifulSoup for scraping.
    """
    requests_data = requests.get(website_url)
    if requests_data.status_code == 200:
        # 'html.parser' is the built-in parser; plain 'html' is not a valid name.
        soup = BeautifulSoup(requests_data.text, 'html.parser')
        return soup
    return None

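# A slightly hardened variant one might swap in (reference only, never called
# by the menu; the timeout value and User-Agent string are illustrative
# assumptions, not part of the original tool):
def process_url_request_safe(website_url):
    headers = {"User-Agent": "Mozilla/5.0 (scraper demo)"}
    try:
        response = requests.get(website_url, headers=headers, timeout=10)
    except requests.RequestException:
        # DNS failure, refused connection, timeout, etc.
        return None
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    return None
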
def process_beautiful_soup_data(soup):
    # Collect the parts of the page we care about into one dict.
    return {
        'title': soup.find('title').text,
        'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
        'all_anchors': [str(i) for i in soup.find_all('a')],
        'all_images_data': [str(i) for i in soup.find_all('img')],
        'all_images_source_data': [i['src'] for i in soup.find_all('img', src=True)],
        'all_h1_data': [i.text for i in soup.find_all('h1')],
        'all_h2_data': [i.text for i in soup.find_all('h2')],
        'all_h3_data': [i.text for i in soup.find_all('h3')],
        'all_p_data': [i.text for i in soup.find_all('p')]
    }

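# A minimal sketch of the request + parse pipeline above (reference only,
# never called by the menu; assumes network access to https://example.com):
def _demo_scrape_example():
    soup = process_url_request("https://example.com")
    if soup is not None:
        packet = process_beautiful_soup_data(soup)
        print(packet['title'], len(packet['all_p_data']))
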
# Infinite loop so the menu keeps running until the user chooses to exit.
while True:

    print(""" ================ Welcome to this scraping program =============
    ==>> press 1 to view existing scraped websites
    ==>> press 2 to scrape a single website
    ==>> press 3 to exit
    """)

    try:
        choice = int(input("==>> Please enter your choice: "))
    except ValueError:
        choice = 0  # non-numeric input falls through to the invalid-choice branch

    # Load (or create) the data from the json file.
    local_json_db = load_json()
    existing_scraped_data_init(local_json_db)

    if choice == 1:
        # BeautifulTable presents the scraped data nicely; read more at
        # https://beautifultable.readthedocs.io/en/latest/index.html
        scraped_websites_table = BeautifulTable()
        scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "Title", "Scraped at", "Status"]
        scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)

        for count, data in enumerate(local_json_db['scraped_data']):
            scraped_websites_table.rows.append([count + 1,
                                                local_json_db['scraped_data'][data]['alias'],
                                                local_json_db['scraped_data'][data]['domain'],
                                                local_json_db['scraped_data'][data]['title'],
                                                local_json_db['scraped_data'][data]['scraped_at'],
                                                local_json_db['scraped_data'][data]['status']])
        # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']]
        if not local_json_db['scraped_data']:
            print('===> No existing data found !!!')
        print(scraped_websites_table)

    elif choice == 2:
        print()
        url_for_scrap = input("===> Please enter the url you want to scrape: ")
        is_accessible = process_url_request(url_for_scrap)
        if is_accessible:
            scraped_data_packet = process_beautiful_soup_data(is_accessible)
            print()
            print(' =====> Data scraped successfully !!!')
            key_for_storing_data = input("enter an alias name for saving the scraped data: ")
            scraped_data_packet['url'] = url_for_scrap
            scraped_data_packet['name'] = key_for_storing_data
            scraped_data_packet['scraped_at'] = scraped_time_is()
            if key_for_storing_data in local_json_db['scraped_data']:
                # Avoid overwriting an existing entry: append a timestamp to the key.
                key_for_storing_data = key_for_storing_data + str(scraped_time_is())
                print("Provided key already exists, so data stored as: {}".format(key_for_storing_data))
            scraped_data_packet['alias'] = key_for_storing_data
            scraped_data_packet['status'] = True
            scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc

            local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
            print(
                'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
            )
            save_scraped_data_in_json(local_json_db)
            # reload data so the table view reflects what was just saved
            local_json_db = load_json()
            print(' =====> Data saved successfully !!!')
            print()
        else:
            print('===> Could not reach the provided url !!!')

    elif choice == 3:
        print('Thank you for using !!!')
        break

    else:
        print("enter a valid choice ")
