# import required modules
import json
import requests
from datetime import datetime
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from beautifultable import BeautifulTable


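# requests, bs4 (BeautifulSoup) and beautifultable are third-party packages and
# are assumed to be installed, e.g. via:
#     pip install requests beautifulsoup4 beautifultable
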
def load_json(database_json_file="scraped_data.json"):
    """
    Load the JSON data from the scraped_data.json file if it exists,
    otherwise return an empty dictionary.
    """
    try:
        with open(database_json_file, "r") as read_it:
            all_data_base = json.load(read_it)
            return all_data_base
    except (FileNotFoundError, json.JSONDecodeError):
        all_data_base = dict()
        return all_data_base


def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
    """
    Save the scraped data in JSON format to scraped_data.json, creating the
    file if it does not exist. Because previously scraped data is loaded and
    merged first, older records remain viewable after each save.
    """
    with open(database_json_file, "w") as file_obj:
        file_obj.write(json.dumps(data))


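# For reference, the saved scraped_data.json roughly takes this shape, keyed
# by the alias chosen when scraping (the values below are illustrative):
#
# {
#     "scraped_data": {
#         "my_alias": {
#             "alias": "my_alias",
#             "url": "https://example.com",
#             "domain": "example.com",
#             "title": "Example Domain",
#             "scraped_at": "25/12/2023 14:30:05",
#             "status": true,
#             "all_anchor_href": ["..."],
#             ...
#         }
#     }
# }
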
def existing_scraped_data_init(json_db):
    """
    Initialise the "scraped_data" section: if the loaded JSON database does
    not have it yet, create it as an empty dictionary.
    """
    scraped_data = json_db.get("scraped_data")
    if scraped_data is None:
        json_db['scraped_data'] = dict()

    return None


def scraped_time_is():
    """
    Create a timestamp so that each scraping record stays trackable.
    """
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    return dt_string

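# e.g. scraped_time_is() returns something like "25/12/2023 14:30:05"
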
def process_url_request(website_url):
    """
    Process the provided URL: fetch it with the requests module and
    construct soup data with BeautifulSoup for scraping.
    """
    request_data = requests.get(website_url)
    if request_data.status_code == 200:
        soup = BeautifulSoup(request_data.text, 'html.parser')
        return soup
    return None

def proccess_beautiful_soup_data(soup):
    # Collect the commonly useful page elements into one dictionary.
    title_tag = soup.find('title')
    return {
        'title': title_tag.text if title_tag else '',
        'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
        'all_anchors': [str(i) for i in soup.find_all('a')],
        'all_images_data': [str(i) for i in soup.find_all('img')],
        'all_images_source_data': [i['src'] for i in soup.find_all('img', src=True)],
        'all_h1_data': [i.text for i in soup.find_all('h1')],
        'all_h2_data': [i.text for i in soup.find_all('h2')],
        'all_h3_data': [i.text for i in soup.find_all('h3')],
        'all_p_data': [i.text for i in soup.find_all('p')]
    }


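# A minimal usage sketch of the two helpers above, assuming
# "https://example.com" as a stand-in URL:
#
#     page_soup = process_url_request("https://example.com")
#     if page_soup:
#         packet = proccess_beautiful_soup_data(page_soup)
#         print(packet['title'], len(packet['all_anchor_href']))
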
# The menu runs in an infinite loop so the user doesn't have to rerun the
# script for every action.
while True:

    print("""================ Welcome to this scraping program ================
    ==>> press 1 to check existing scraped websites
    ==>> press 2 to scrape a single website
    ==>> press 3 to exit
    """)

    try:
        choice = int(input("==>> Please enter your choice: "))
    except ValueError:
        print("===> Please enter a valid choice !!!")
        continue

    # Load the JSON database: fetch existing data from the JSON file (or start
    # with an empty one) and make sure the "scraped_data" key exists.
    local_json_db = load_json()
    existing_scraped_data_init(local_json_db)

    if choice == 1:
        # BeautifulTable is used to present the scraped data in a readable way.
        # You can read more about it here:
        # https://beautifultable.readthedocs.io/en/latest/index.html
        scraped_websites_table = BeautifulTable()
        scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "Title", "Scraped at", "Status"]
        scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)

        local_json_db = load_json()
        for count, (alias_key, data) in enumerate(local_json_db['scraped_data'].items()):
            scraped_websites_table.rows.append([count + 1,
                                                data['alias'],
                                                data['domain'],
                                                data['title'],
                                                data['scraped_at'],
                                                data['status']])
        # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']]
        if not local_json_db['scraped_data']:
            print('===> No existing data found !!!')
        else:
            print(scraped_websites_table)

    elif choice == 2:
        print()
        url_for_scrap = input("===> Please enter the url you want to scrape: ")
        page_soup = process_url_request(url_for_scrap)
        if page_soup:
            scraped_data_packet = proccess_beautiful_soup_data(page_soup)
            print()
            print(' =====> Data scraped successfully !!!')
            key_for_storing_data = input("enter an alias name for saving the scraped data: ")
            scraped_data_packet['url'] = url_for_scrap
            scraped_data_packet['name'] = key_for_storing_data
            scraped_data_packet['scraped_at'] = scraped_time_is()
            if key_for_storing_data in local_json_db['scraped_data']:
                key_for_storing_data = key_for_storing_data + scraped_time_is()
                print("Provided alias already exists, so data is stored as: {}".format(key_for_storing_data))
            scraped_data_packet['alias'] = key_for_storing_data
            scraped_data_packet['status'] = True
            scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc

            local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
            print(
                'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
            )
            save_scraped_data_in_json(local_json_db)
            # reload the freshly saved data
            local_json_db = load_json()
            print(' =====> Data saved successfully !!!')
            print()
        else:
            print('===> Could not fetch the provided URL !!!')
    elif choice == 3:
        print('Thank you for using !!!')
        break

    else:
        print("===> Please enter a valid choice !!!")
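
# To try it out, run the file directly (the filename is whatever you saved it
# as, e.g. scraper.py) and pick an option from the menu:
#     python scraper.py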