From ae0ca376d8e0ec93f26e6511d30172c5ae4b59ee Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 12:21:18 +0000 Subject: [PATCH 1/9] Initial commit. Set up virtual environment. Created blank code and test files. --- data_extractor.py | 0 tests/test_data_extractor.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 data_extractor.py create mode 100644 tests/test_data_extractor.py diff --git a/data_extractor.py b/data_extractor.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py new file mode 100644 index 0000000..e69de29 From a9c9f2442adff424a297ae0a9a5904dc53bca5fd Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 12:55:35 +0000 Subject: [PATCH 2/9] Add find_items method and initial test. --- data_extractor.py | 21 +++++++++++++++++++++ tests/__init__.py | 0 tests/test_data_extractor.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) create mode 100644 tests/__init__.py diff --git a/data_extractor.py b/data_extractor.py index e69de29..47cb03d 100644 --- a/data_extractor.py +++ b/data_extractor.py @@ -0,0 +1,21 @@ +from websites.resources.data import WEBSITES + + +class DataExtractor: + """ + Use to extract, cleanse and amend incorrect website data collection. + """ + def __init__(self): + self.data = WEBSITES + + def find_items(self, value): + """ + Find and return a new list of items where key "value" is greater than or equal to parameter value. + :return: list(dict), list of dictionaries matching the above filtering rule. + """ + return [item for item in self.data if item.get('value') and item.get('value') >= value] + + +# data_extractor = DataExtractor() +# print(data_extractor.find_items(4)) +# print(len(data_extractor.find_items(4))) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py index e69de29..86c11aa 100644 --- a/tests/test_data_extractor.py +++ b/tests/test_data_extractor.py @@ -0,0 +1,32 @@ +import pytest + +from data_extractor import DataExtractor + +data_extractor = DataExtractor() + + +class TestDataExtractor: + + def test_find_items(self): + expected = [ + { + 'name': 'Google', + 'url': 'https://www.google.co.uk', + 'domain': 'google.co.uk', + 'secure': True, + 'value': 5}, + { + 'name': 'Facebook', + 'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/', + 'domain': 'facebook.com', + 'secure': True, + 'value': 4}, + { + 'name': 'YouTube', + 'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc', + 'domain': 'youtube.com', + 'secure': True, + 'value': 5 + } + ] + assert data_extractor.find_items(4) == expected From a16333ab5e08d3e1a569168a44ada88392d230cc Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 13:16:52 +0000 Subject: [PATCH 3/9] Add amend_domain_values and basic test. --- data_extractor.py | 24 ++++++++++++--- tests/test_data_extractor.py | 59 +++++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 6 deletions(-) diff --git a/data_extractor.py b/data_extractor.py index 47cb03d..fa5b5cb 100644 --- a/data_extractor.py +++ b/data_extractor.py @@ -8,14 +8,28 @@ class DataExtractor: def __init__(self): self.data = WEBSITES - def find_items(self, value): + def find_items(self, value=4): """ - Find and return a new list of items where key "value" is greater than or equal to parameter value. + Find and return a new list of items where key "value" is greater than or equal to parameter value. Default = 4. :return: list(dict), list of dictionaries matching the above filtering rule. """ return [item for item in self.data if item.get('value') and item.get('value') >= value] + def amend_domain_values(self, prefix='www.'): + """ + Fixes missing parts of the domain names. + :param prefix: str, prefix to add to the domain name. Default = 'www'. + :return: amended: list(dict), amended list of web records. + """ + amended = [] + for item in self.data: + if item.get('domain') and not item.get('domain').startswith(prefix): + item['domain'] = f"{prefix}{item['domain']}" + amended.append(item) + return amended + -# data_extractor = DataExtractor() -# print(data_extractor.find_items(4)) -# print(len(data_extractor.find_items(4))) +data_extractor = DataExtractor() +# print(data_extractor.amend_domain_values()) +print(data_extractor.find_items(4)) +print(len(data_extractor.find_items(4))) diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py index 86c11aa..7996016 100644 --- a/tests/test_data_extractor.py +++ b/tests/test_data_extractor.py @@ -29,4 +29,61 @@ def test_find_items(self): 'value': 5 } ] - assert data_extractor.find_items(4) == expected + assert data_extractor.find_items() == expected + + def test_amend_domain_values(self): + expected = [ + { + 'name': 'Google', + 'url': 'https://www.google.co.uk', + 'domain': 'www.google.co.uk', + 'secure': True, + 'value': 5}, + { + 'name': 'Facebook', + 'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/', + 'domain': 'www.facebook.com', + 'secure': True, 'value': 4}, + { + 'name': 'Bing', + 'url': 'https://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A', + 'domain': 'www.bing.com', + 'secure': False, + 'value': 3 + }, + { + 'name': 'Ask', + 'url': 'https://uk.ask.com/web?o=0&l=dir&qo=serpSearchTopBox&q=jupiter', + 'domain': 'www.ask.com', + 'secure': False, + 'value': 1}, + { + 'name': 'Duck Duck Go', + 'url': 'http://duckduckgo.com/?q=plane&t=h_&ia=web', + 'domain': 'www.duckduckgo.com', + 'secure': True, + 'value': 2 + }, + { + 'name': 'Vimeo', + 'url': 'https://vimeo.com/53812885', + 'domain': 'www.vimeo.com', + 'secure': False, + 'value': 2 + }, + { + 'name': 'YouTube', + 'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc', + 'domain': 'www.youtube.com', + 'secure': True, + 'value': 5 + }, + { + 'name': 'Daily Motion', + 'url': 'http://www.dailymotion.com/search/football', + 'domain': 'www.dailymotion.com', + 'secure': True, + 'value': 1 + } + ] + assert data_extractor.amend_domain_values() == expected From 75c8e82ffa220279655d63347af9acb9b2ccd74f Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 13:18:46 +0000 Subject: [PATCH 4/9] Allow DataExtractor to accept data. --- data_extractor.py | 6 +++--- tests/test_data_extractor.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/data_extractor.py b/data_extractor.py index fa5b5cb..f0558d0 100644 --- a/data_extractor.py +++ b/data_extractor.py @@ -5,8 +5,8 @@ class DataExtractor: """ Use to extract, cleanse and amend incorrect website data collection. """ - def __init__(self): - self.data = WEBSITES + def __init__(self, data): + self.data = data def find_items(self, value=4): """ @@ -29,7 +29,7 @@ def amend_domain_values(self, prefix='www.'): return amended -data_extractor = DataExtractor() +data_extractor = DataExtractor(WEBSITES) # print(data_extractor.amend_domain_values()) print(data_extractor.find_items(4)) print(len(data_extractor.find_items(4))) diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py index 7996016..f1fc0bb 100644 --- a/tests/test_data_extractor.py +++ b/tests/test_data_extractor.py @@ -1,8 +1,9 @@ import pytest from data_extractor import DataExtractor +from websites.resources.data import WEBSITES -data_extractor = DataExtractor() +data_extractor = DataExtractor(WEBSITES) class TestDataExtractor: From 8dd27a677a72b9d54fc8f9802afdb71293067140 Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 13:22:32 +0000 Subject: [PATCH 5/9] Add tests for find_items. --- tests/test_data_extractor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py index f1fc0bb..e91a0eb 100644 --- a/tests/test_data_extractor.py +++ b/tests/test_data_extractor.py @@ -32,6 +32,12 @@ def test_find_items(self): ] assert data_extractor.find_items() == expected + def test_find_items_none_found(self): + assert data_extractor.find_items(100) == [] + + def test_find_items_all_matching(self): + assert data_extractor.find_items(1) == WEBSITES + def test_amend_domain_values(self): expected = [ { From 304c48b3c276b6ba15633d855bb80221586d5e0e Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 13:26:57 +0000 Subject: [PATCH 6/9] Add test to amend_domain_values (confirm original kept intact). --- tests/test_data_extractor.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py index e91a0eb..393d8eb 100644 --- a/tests/test_data_extractor.py +++ b/tests/test_data_extractor.py @@ -94,3 +94,16 @@ def test_amend_domain_values(self): } ] assert data_extractor.amend_domain_values() == expected + + def test_amend_domain_values_retains_original_if_prefix_matching(self): + test_data = [ + { + 'name': 'Google', + 'url': 'https://www.google.co.uk', + 'domain': 'www.google.co.uk', + 'secure': True, + 'value': 5 + } + ] + _data_extractor = DataExtractor(test_data) + assert _data_extractor.amend_domain_values() == test_data From 179aea4fcdf98931c758a48ea39e9542c3cb82a5 Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 14:11:16 +0000 Subject: [PATCH 7/9] Add cleanse_data and tests. Used test_data containing all possible states for ease of testing. --- data_extractor.py | 27 +++++++++++++-- tests/test_data_extractor.py | 65 ++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 3 deletions(-) diff --git a/data_extractor.py b/data_extractor.py index f0558d0..0595ba6 100644 --- a/data_extractor.py +++ b/data_extractor.py @@ -28,8 +28,29 @@ def amend_domain_values(self, prefix='www.'): amended.append(item) return amended + def cleanse_data(self): + """ + Fix errors in "secure" key values. All urls starting with https should be set to "secure": True, those starting + with http "secure": False. + :return: amended: list(dict), amended list of web records. + """ + amended = [] + for item in self.data: + url = item.get('url') + secure = item.get('secure') + # https marked as secure = False + if url and url.startswith('https:') and not item.get('secure'): + item['secure'] = True + # http marked as secure = True + elif url and url.startswith('http:') and item.get('secure'): + item['secure'] = False + amended.append(item) + return amended + + -data_extractor = DataExtractor(WEBSITES) +# data_extractor = DataExtractor(WEBSITES) # print(data_extractor.amend_domain_values()) -print(data_extractor.find_items(4)) -print(len(data_extractor.find_items(4))) +# print(data_extractor.find_items(4)) +# print(len(data_extractor.find_items(4))) +# print(data_extractor.cleanse_data()) diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py index 393d8eb..d1bc922 100644 --- a/tests/test_data_extractor.py +++ b/tests/test_data_extractor.py @@ -107,3 +107,68 @@ def test_amend_domain_values_retains_original_if_prefix_matching(self): ] _data_extractor = DataExtractor(test_data) assert _data_extractor.amend_domain_values() == test_data + + def test_cleanse_data(self): + test_data = [ + { + 'name': 'Google', + 'url': 'https://www.google.co.uk', + 'domain': 'google.co.uk', + 'secure': False, + 'value': 5 + }, + { + 'name': 'Facebook', + 'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/', + 'domain': 'facebook.com', + 'secure': True, + 'value': 4 + }, + { + 'name': 'Bing', + 'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A', + 'domain': 'bing.com', + 'secure': False, + 'value': 3 + }, + { + 'name': 'Duck Duck Go', + 'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web', + 'domain': 'duckduckgo.com', + 'secure': True, + 'value': 2 + }, + ] + + expected = [ + { + 'name': 'Google', + 'url': 'https://www.google.co.uk', + 'domain': 'google.co.uk', + 'secure': True, + 'value': 5 + }, + { + 'name': 'Facebook', + 'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/', + 'domain': 'facebook.com', + 'secure': False, + 'value': 4 + }, + { + 'name': 'Bing', + 'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A', + 'domain': 'bing.com', + 'secure': False, + 'value': 3 + }, + { + 'name': 'Duck Duck Go', + 'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web', + 'domain': 'duckduckgo.com', + 'secure': True, + 'value': 2 + }, + ] + _data_extractor = DataExtractor(test_data) + assert _data_extractor.cleanse_data() == expected From 91f1b0b305e913ea9d9ec7d12982965f878fc55f Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 14:22:08 +0000 Subject: [PATCH 8/9] Add get_value_sum and tests. Clean up code a bit. --- data_extractor.py | 36 +++++++++++++++++------------------- tests/test_data_extractor.py | 11 ++++++++++- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/data_extractor.py b/data_extractor.py index 0595ba6..6530e51 100644 --- a/data_extractor.py +++ b/data_extractor.py @@ -1,16 +1,14 @@ -from websites.resources.data import WEBSITES - - class DataExtractor: """ - Use to extract, cleanse and amend incorrect website data collection. + Use to extract, cleanse, sum and amend incorrect website data collection. """ def __init__(self, data): self.data = data def find_items(self, value=4): """ - Find and return a new list of items where key "value" is greater than or equal to parameter value. Default = 4. + Find and return a new list of items where key "value" is greater than or equal to parameter value. + :param value: int, value to find items for, default = 4. :return: list(dict), list of dictionaries matching the above filtering rule. """ return [item for item in self.data if item.get('value') and item.get('value') >= value] @@ -18,7 +16,7 @@ def find_items(self, value=4): def amend_domain_values(self, prefix='www.'): """ Fixes missing parts of the domain names. - :param prefix: str, prefix to add to the domain name. Default = 'www'. + :param prefix: str, prefix to add to the domain name, default = 'www'. :return: amended: list(dict), amended list of web records. """ amended = [] @@ -38,19 +36,19 @@ def cleanse_data(self): for item in self.data: url = item.get('url') secure = item.get('secure') - # https marked as secure = False - if url and url.startswith('https:') and not item.get('secure'): - item['secure'] = True - # http marked as secure = True - elif url and url.startswith('http:') and item.get('secure'): - item['secure'] = False + if url: + # https marked as secure = False + if url.startswith('https:') and not secure: + item['secure'] = True + # http marked as secure = True + elif url.startswith('http:') and secure: + item['secure'] = False amended.append(item) return amended - - -# data_extractor = DataExtractor(WEBSITES) -# print(data_extractor.amend_domain_values()) -# print(data_extractor.find_items(4)) -# print(len(data_extractor.find_items(4))) -# print(data_extractor.cleanse_data()) + def get_value_sum(self): + """ + Returns sum of all value keys in the data set. + :return: int, sum of all value keys in the data set. + """ + return sum([item.get('value', 0) for item in self.data]) diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py index d1bc922..444675c 100644 --- a/tests/test_data_extractor.py +++ b/tests/test_data_extractor.py @@ -1,4 +1,6 @@ -import pytest +# Tests are written for pytest framework. +# use: pip install pytest +# to test: pytest from data_extractor import DataExtractor from websites.resources.data import WEBSITES @@ -172,3 +174,10 @@ def test_cleanse_data(self): ] _data_extractor = DataExtractor(test_data) assert _data_extractor.cleanse_data() == expected + + def test_get_value_sum(self): + assert data_extractor.get_value_sum() == 23 + + def test_get_value_sum_empty_data_set(self): + _data_extractor = DataExtractor([]) + assert _data_extractor.get_value_sum() == 0 From 6c53d932bb7045aae5de7fbb753adbdc51ed7140 Mon Sep 17 00:00:00 2001 From: Tomasz-Kluczkowski Date: Tue, 8 Jan 2019 22:14:23 +0000 Subject: [PATCH 9/9] Small improvements to comments. --- data_extractor.py | 6 +++--- tests/test_data_extractor.py | 4 ---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/data_extractor.py b/data_extractor.py index 6530e51..a3fa47c 100644 --- a/data_extractor.py +++ b/data_extractor.py @@ -8,15 +8,15 @@ def __init__(self, data): def find_items(self, value=4): """ Find and return a new list of items where key "value" is greater than or equal to parameter value. - :param value: int, value to find items for, default = 4. + :param value: int, value to find items for. :return: list(dict), list of dictionaries matching the above filtering rule. """ return [item for item in self.data if item.get('value') and item.get('value') >= value] def amend_domain_values(self, prefix='www.'): """ - Fixes missing parts of the domain names. - :param prefix: str, prefix to add to the domain name, default = 'www'. + Fixes missing parts of the domain names. By default we add missing 'www.'. + :param prefix: str, prefix to add to the domain name. :return: amended: list(dict), amended list of web records. """ amended = [] diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py index 444675c..29a453d 100644 --- a/tests/test_data_extractor.py +++ b/tests/test_data_extractor.py @@ -1,7 +1,3 @@ -# Tests are written for pytest framework. -# use: pip install pytest -# to test: pytest - from data_extractor import DataExtractor from websites.resources.data import WEBSITES