From 264506561649c7d85ee07e8452e42c9ef35c2a1b Mon Sep 17 00:00:00 2001 From: chintu4 <67625427+chintu4@users.noreply.github.com> Date: Fri, 7 Oct 2022 09:23:14 +0530 Subject: [PATCH 1/4] Add files via upload --- reading_time_cal.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 reading_time_cal.py diff --git a/reading_time_cal.py b/reading_time_cal.py new file mode 100644 index 0000000..2161cbd --- /dev/null +++ b/reading_time_cal.py @@ -0,0 +1,43 @@ +import bs4 +import urllib, re +import sys + +# Words per minute +WPM = 200 +WORD_LENGTH = 5 + +# 1 +def extract_text(url): + html = urllib.request.urlopen(url).read() + soup = bs4.BeautifulSoup(html, 'html.parser') + texts = soup.findAll(text=True) + return texts + +def is_visible(element): + if element.parent.name in ['style', 'script', '[document]', 'head', 'title']: + return False + elif isinstance(element, bs4.element.Comment): + return False + elif element.string == "\n": + return False + return True + +# 2 +def filter_visible_text(page_texts): + return filter(is_visible, page_texts) + +def count_words_in_text(text_list, word_length): + total_words = 0 + for current_text in text_list: + total_words += len(current_text)/word_length + return total_words + +# 3 +def estimate_reading_time(url): + texts = extract_text(url) + filtered_text = filter_visible_text(texts) + total_words = count_words_in_text(filtered_text, WORD_LENGTH) + return total_words/WPM + + +print( estimate_reading_time(sys.argv[1])) From 97da52c2f1bfd000c7c0e41392b5486fe162b99c Mon Sep 17 00:00:00 2001 From: chintu4 <67625427+chintu4@users.noreply.github.com> Date: Fri, 7 Oct 2022 09:26:19 +0530 Subject: [PATCH 2/4] Add files via upload --- reading_time_estimator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reading_time_estimator.py b/reading_time_estimator.py index 6bbea89..2161cbd 100644 --- a/reading_time_estimator.py +++ b/reading_time_estimator.py @@ -1,5 +1,6 @@ import bs4 import urllib, re +import sys # Words per minute WPM = 200 @@ -7,7 +8,7 @@ # 1 def extract_text(url): - html = urllib.urlopen(url).read() + html = urllib.request.urlopen(url).read() soup = bs4.BeautifulSoup(html, 'html.parser') texts = soup.findAll(text=True) return texts @@ -38,4 +39,5 @@ def estimate_reading_time(url): total_words = count_words_in_text(filtered_text, WORD_LENGTH) return total_words/WPM -print estimate_reading_time("http://www.assafelovic.com/blog/2017/6/27/estimating-an-articles-reading-time") + +print( estimate_reading_time(sys.argv[1])) From c8a76ab6832a5a1aeb8260d4fd8d2cf25d9e3dbc Mon Sep 17 00:00:00 2001 From: chintu4 <67625427+chintu4@users.noreply.github.com> Date: Fri, 7 Oct 2022 09:26:34 +0530 Subject: [PATCH 3/4] Delete reading_time_cal.py --- reading_time_cal.py | 43 ------------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 reading_time_cal.py diff --git a/reading_time_cal.py b/reading_time_cal.py deleted file mode 100644 index 2161cbd..0000000 --- a/reading_time_cal.py +++ /dev/null @@ -1,43 +0,0 @@ -import bs4 -import urllib, re -import sys - -# Words per minute -WPM = 200 -WORD_LENGTH = 5 - -# 1 -def extract_text(url): - html = urllib.request.urlopen(url).read() - soup = bs4.BeautifulSoup(html, 'html.parser') - texts = soup.findAll(text=True) - return texts - -def is_visible(element): - if element.parent.name in ['style', 'script', '[document]', 'head', 'title']: - return False - elif isinstance(element, bs4.element.Comment): - return False - elif element.string == "\n": - return False - return True - -# 2 -def filter_visible_text(page_texts): - return filter(is_visible, page_texts) - -def count_words_in_text(text_list, word_length): - total_words = 0 - for current_text in text_list: - total_words += len(current_text)/word_length - return total_words - -# 3 -def estimate_reading_time(url): - texts = extract_text(url) - filtered_text = filter_visible_text(texts) - total_words = count_words_in_text(filtered_text, WORD_LENGTH) - return total_words/WPM - - -print( estimate_reading_time(sys.argv[1])) From d54ece405189e4eefe91d7d422a1d66ec0a46a7e Mon Sep 17 00:00:00 2001 From: chintu4 <67625427+chintu4@users.noreply.github.com> Date: Fri, 7 Oct 2022 09:27:10 +0530 Subject: [PATCH 4/4] Create README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..4b39eef --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# reading_time_estimator +Python script for estimating an article's reading time + this works with python3