From ae0ca376d8e0ec93f26e6511d30172c5ae4b59ee Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 12:21:18 +0000
Subject: [PATCH 1/9] Initial commit.

Set up virtual environment.
Created blank code and test files.
---
 data_extractor.py            | 0
 tests/test_data_extractor.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data_extractor.py
 create mode 100644 tests/test_data_extractor.py

diff --git a/data_extractor.py b/data_extractor.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
new file mode 100644
index 0000000..e69de29

From a9c9f2442adff424a297ae0a9a5904dc53bca5fd Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 12:55:35 +0000
Subject: [PATCH 2/9] Add find_items method and initial test.

---
 data_extractor.py            | 21 +++++++++++++++++++++
 tests/__init__.py            |  0
 tests/test_data_extractor.py | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)
 create mode 100644 tests/__init__.py

diff --git a/data_extractor.py b/data_extractor.py
index e69de29..47cb03d 100644
--- a/data_extractor.py
+++ b/data_extractor.py
@@ -0,0 +1,21 @@
+from websites.resources.data import WEBSITES
+
+
+class DataExtractor:
+    """
+    Use to extract, cleanse and amend incorrect website data collection.
+    """
+    def __init__(self):
+        self.data = WEBSITES
+
+    def find_items(self, value):
+        """
+        Find and return a new list of items where key "value" is greater than or equal to parameter value.
+        :return: list(dict), list of dictionaries matching the above filtering rule.
+        """
+        return [item for item in self.data if item.get('value') and item.get('value') >= value]
+
+
+# data_extractor = DataExtractor()
+# print(data_extractor.find_items(4))
+# print(len(data_extractor.find_items(4)))
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
index e69de29..86c11aa 100644
--- a/tests/test_data_extractor.py
+++ b/tests/test_data_extractor.py
@@ -0,0 +1,32 @@
+import pytest
+
+from data_extractor import DataExtractor
+
+data_extractor = DataExtractor()
+
+
+class TestDataExtractor:
+
+    def test_find_items(self):
+        expected = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'google.co.uk',
+                'secure': True,
+                'value': 5},
+            {
+                'name': 'Facebook',
+                'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
+                'domain': 'facebook.com',
+                'secure': True,
+                'value': 4},
+            {
+                'name': 'YouTube',
+                'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc',
+                'domain': 'youtube.com',
+                'secure': True,
+                'value': 5
+            }
+        ]
+        assert data_extractor.find_items(4) == expected

From a16333ab5e08d3e1a569168a44ada88392d230cc Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 13:16:52 +0000
Subject: [PATCH 3/9] Add amend_domain_values and basic test.

---
 data_extractor.py            | 24 ++++++++++++---
 tests/test_data_extractor.py | 59 +++++++++++++++++++++++++++++++++++-
 2 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/data_extractor.py b/data_extractor.py
index 47cb03d..fa5b5cb 100644
--- a/data_extractor.py
+++ b/data_extractor.py
@@ -8,14 +8,28 @@ class DataExtractor:
     def __init__(self):
         self.data = WEBSITES
 
-    def find_items(self, value):
+    def find_items(self, value=4):
         """
-        Find and return a new list of items where key "value" is greater than or equal to parameter value.
+        Find and return a new list of items where key "value" is greater than or equal to parameter value. Default = 4.
         :return: list(dict), list of dictionaries matching the above filtering rule.
         """
         return [item for item in self.data if item.get('value') and item.get('value') >= value]
 
+    def amend_domain_values(self, prefix='www.'):
+        """
+        Fixes missing parts of the domain names.
+        :param prefix: str, prefix to add to the domain name. Default = 'www'.
+        :return: amended: list(dict), amended list of web records.
+        """
+        amended = []
+        for item in self.data:
+            if item.get('domain') and not item.get('domain').startswith(prefix):
+                item['domain'] = f"{prefix}{item['domain']}"
+            amended.append(item)
+        return amended
+
 
-# data_extractor = DataExtractor()
-# print(data_extractor.find_items(4))
-# print(len(data_extractor.find_items(4)))
+data_extractor = DataExtractor()
+# print(data_extractor.amend_domain_values())
+print(data_extractor.find_items(4))
+print(len(data_extractor.find_items(4)))
diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
index 86c11aa..7996016 100644
--- a/tests/test_data_extractor.py
+++ b/tests/test_data_extractor.py
@@ -29,4 +29,61 @@ def test_find_items(self):
                 'value': 5
             }
         ]
-        assert data_extractor.find_items(4) == expected
+        assert data_extractor.find_items() == expected
+
+    def test_amend_domain_values(self):
+        expected = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'www.google.co.uk',
+                'secure': True,
+                'value': 5},
+            {
+                'name': 'Facebook',
+                'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
+                'domain': 'www.facebook.com',
+                'secure': True, 'value': 4},
+            {
+                'name': 'Bing',
+                'url': 'https://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
+                'domain': 'www.bing.com',
+                'secure': False,
+                'value': 3
+            },
+            {
+                'name': 'Ask',
+                'url': 'https://uk.ask.com/web?o=0&l=dir&qo=serpSearchTopBox&q=jupiter',
+                'domain': 'www.ask.com',
+                'secure': False,
+                'value': 1},
+            {
+                'name': 'Duck Duck Go',
+                'url': 'http://duckduckgo.com/?q=plane&t=h_&ia=web',
+                'domain': 'www.duckduckgo.com',
+                'secure': True,
+                'value': 2
+            },
+            {
+                'name': 'Vimeo',
+                'url': 'https://vimeo.com/53812885',
+                'domain': 'www.vimeo.com',
+                'secure': False,
+                'value': 2
+            },
+            {
+                'name': 'YouTube',
+                'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc',
+                'domain': 'www.youtube.com',
+                'secure': True,
+                'value': 5
+             },
+            {
+                'name': 'Daily Motion',
+                'url': 'http://www.dailymotion.com/search/football',
+                'domain': 'www.dailymotion.com',
+                'secure': True,
+                'value': 1
+            }
+        ]
+        assert data_extractor.amend_domain_values() == expected

From 75c8e82ffa220279655d63347af9acb9b2ccd74f Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 13:18:46 +0000
Subject: [PATCH 4/9] Allow DataExtractor to accept data.

---
 data_extractor.py            | 6 +++---
 tests/test_data_extractor.py | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/data_extractor.py b/data_extractor.py
index fa5b5cb..f0558d0 100644
--- a/data_extractor.py
+++ b/data_extractor.py
@@ -5,8 +5,8 @@ class DataExtractor:
     """
     Use to extract, cleanse and amend incorrect website data collection.
     """
-    def __init__(self):
-        self.data = WEBSITES
+    def __init__(self, data):
+        self.data = data
 
     def find_items(self, value=4):
         """
@@ -29,7 +29,7 @@ def amend_domain_values(self, prefix='www.'):
         return amended
 
 
-data_extractor = DataExtractor()
+data_extractor = DataExtractor(WEBSITES)
 # print(data_extractor.amend_domain_values())
 print(data_extractor.find_items(4))
 print(len(data_extractor.find_items(4)))
diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
index 7996016..f1fc0bb 100644
--- a/tests/test_data_extractor.py
+++ b/tests/test_data_extractor.py
@@ -1,8 +1,9 @@
 import pytest
 
 from data_extractor import DataExtractor
+from websites.resources.data import WEBSITES
 
-data_extractor = DataExtractor()
+data_extractor = DataExtractor(WEBSITES)
 
 
 class TestDataExtractor:

From 8dd27a677a72b9d54fc8f9802afdb71293067140 Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 13:22:32 +0000
Subject: [PATCH 5/9] Add tests for find_items.

---
 tests/test_data_extractor.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
index f1fc0bb..e91a0eb 100644
--- a/tests/test_data_extractor.py
+++ b/tests/test_data_extractor.py
@@ -32,6 +32,12 @@ def test_find_items(self):
         ]
         assert data_extractor.find_items() == expected
 
+    def test_find_items_none_found(self):
+        assert data_extractor.find_items(100) == []
+
+    def test_find_items_all_matching(self):
+        assert data_extractor.find_items(1) == WEBSITES
+
     def test_amend_domain_values(self):
         expected = [
             {

From 304c48b3c276b6ba15633d855bb80221586d5e0e Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 13:26:57 +0000
Subject: [PATCH 6/9] Add test for amend_domain_values (confirm domain is kept
 intact when prefix already matches).

---
 tests/test_data_extractor.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
index e91a0eb..393d8eb 100644
--- a/tests/test_data_extractor.py
+++ b/tests/test_data_extractor.py
@@ -94,3 +94,16 @@ def test_amend_domain_values(self):
             }
         ]
         assert data_extractor.amend_domain_values() == expected
+
+    def test_amend_domain_values_retains_original_if_prefix_matching(self):
+        test_data = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'www.google.co.uk',
+                'secure': True,
+                'value': 5
+            }
+        ]
+        _data_extractor = DataExtractor(test_data)
+        assert _data_extractor.amend_domain_values() == test_data

From 179aea4fcdf98931c758a48ea39e9542c3cb82a5 Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 14:11:16 +0000
Subject: [PATCH 7/9] Add cleanse_data and tests.

Use test_data covering all four url-scheme/secure combinations
(http/https x True/False) for ease of testing.
---
 data_extractor.py            | 27 +++++++++++++--
 tests/test_data_extractor.py | 65 ++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/data_extractor.py b/data_extractor.py
index f0558d0..0595ba6 100644
--- a/data_extractor.py
+++ b/data_extractor.py
@@ -28,8 +28,29 @@ def amend_domain_values(self, prefix='www.'):
             amended.append(item)
         return amended
 
+    def cleanse_data(self):
+        """
+        Fix errors in "secure" key values. All urls starting with https should be set to "secure": True, those starting
+        with http "secure": False.
+        :return: amended: list(dict), amended list of web records.
+        """
+        amended = []
+        for item in self.data:
+            url = item.get('url')
+            secure = item.get('secure')
+            # https marked as secure = False
+            if url and url.startswith('https:') and not item.get('secure'):
+                item['secure'] = True
+            # http marked as secure = True
+            elif url and url.startswith('http:') and item.get('secure'):
+                item['secure'] = False
+            amended.append(item)
+        return amended
+
+
 
-data_extractor = DataExtractor(WEBSITES)
+# data_extractor = DataExtractor(WEBSITES)
 # print(data_extractor.amend_domain_values())
-print(data_extractor.find_items(4))
-print(len(data_extractor.find_items(4)))
+# print(data_extractor.find_items(4))
+# print(len(data_extractor.find_items(4)))
+# print(data_extractor.cleanse_data())
diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
index 393d8eb..d1bc922 100644
--- a/tests/test_data_extractor.py
+++ b/tests/test_data_extractor.py
@@ -107,3 +107,68 @@ def test_amend_domain_values_retains_original_if_prefix_matching(self):
         ]
         _data_extractor = DataExtractor(test_data)
         assert _data_extractor.amend_domain_values() == test_data
+
+    def test_cleanse_data(self):
+        test_data = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'google.co.uk',
+                'secure': False,
+                'value': 5
+            },
+            {
+                'name': 'Facebook',
+                'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
+                'domain': 'facebook.com',
+                'secure': True,
+                'value': 4
+            },
+            {
+                'name': 'Bing',
+                'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
+                'domain': 'bing.com',
+                'secure': False,
+                'value': 3
+            },
+            {
+                'name': 'Duck Duck Go',
+                'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
+                'domain': 'duckduckgo.com',
+                'secure': True,
+                'value': 2
+            },
+        ]
+
+        expected = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'google.co.uk',
+                'secure': True,
+                'value': 5
+            },
+            {
+                'name': 'Facebook',
+                'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
+                'domain': 'facebook.com',
+                'secure': False,
+                'value': 4
+            },
+            {
+                'name': 'Bing',
+                'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
+                'domain': 'bing.com',
+                'secure': False,
+                'value': 3
+            },
+            {
+                'name': 'Duck Duck Go',
+                'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
+                'domain': 'duckduckgo.com',
+                'secure': True,
+                'value': 2
+            },
+        ]
+        _data_extractor = DataExtractor(test_data)
+        assert _data_extractor.cleanse_data() == expected

From 91f1b0b305e913ea9d9ec7d12982965f878fc55f Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 14:22:08 +0000
Subject: [PATCH 8/9] Add get_value_sum and tests.

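Clean up code: drop the unused WEBSITES import and the commented-out
debug calls, and hoist the shared `url` check in cleanse_data.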
---
 data_extractor.py            | 36 +++++++++++++++++-------------------
 tests/test_data_extractor.py | 11 ++++++++++-
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/data_extractor.py b/data_extractor.py
index 0595ba6..6530e51 100644
--- a/data_extractor.py
+++ b/data_extractor.py
@@ -1,16 +1,14 @@
-from websites.resources.data import WEBSITES
-
-
 class DataExtractor:
     """
-    Use to extract, cleanse and amend incorrect website data collection.
+    Use to extract, cleanse, sum and amend incorrect website data collection.
     """
     def __init__(self, data):
         self.data = data
 
     def find_items(self, value=4):
         """
-        Find and return a new list of items where key "value" is greater than or equal to parameter value. Default = 4.
+        Find and return a new list of items where key "value" is greater than or equal to parameter value.
+        :param value: int, value to find items for, default = 4.
         :return: list(dict), list of dictionaries matching the above filtering rule.
         """
         return [item for item in self.data if item.get('value') and item.get('value') >= value]
@@ -18,7 +16,7 @@ def find_items(self, value=4):
     def amend_domain_values(self, prefix='www.'):
         """
         Fixes missing parts of the domain names.
-        :param prefix: str, prefix to add to the domain name. Default = 'www'.
+        :param prefix: str, prefix to add to the domain name, default = 'www'.
         :return: amended: list(dict), amended list of web records.
         """
         amended = []
@@ -38,19 +36,19 @@ def cleanse_data(self):
         for item in self.data:
             url = item.get('url')
             secure = item.get('secure')
-            # https marked as secure = False
-            if url and url.startswith('https:') and not item.get('secure'):
-                item['secure'] = True
-            # http marked as secure = True
-            elif url and url.startswith('http:') and item.get('secure'):
-                item['secure'] = False
+            if url:
+                # https marked as secure = False
+                if url.startswith('https:') and not secure:
+                    item['secure'] = True
+                # http marked as secure = True
+                elif url.startswith('http:') and secure:
+                    item['secure'] = False
             amended.append(item)
         return amended
 
-
-
-# data_extractor = DataExtractor(WEBSITES)
-# print(data_extractor.amend_domain_values())
-# print(data_extractor.find_items(4))
-# print(len(data_extractor.find_items(4)))
-# print(data_extractor.cleanse_data())
+    def get_value_sum(self):
+        """
+        Returns sum of all value keys in the data set.
+        :return: int, sum of all value keys in the data set.
+        """
+        return sum([item.get('value', 0) for item in self.data])
diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
index d1bc922..444675c 100644
--- a/tests/test_data_extractor.py
+++ b/tests/test_data_extractor.py
@@ -1,4 +1,6 @@
-import pytest
+# Tests are written for pytest framework.
+# use: pip install pytest
+# to test: pytest
 
 from data_extractor import DataExtractor
 from websites.resources.data import WEBSITES
@@ -172,3 +174,10 @@ def test_cleanse_data(self):
         ]
         _data_extractor = DataExtractor(test_data)
         assert _data_extractor.cleanse_data() == expected
+
+    def test_get_value_sum(self):
+        assert data_extractor.get_value_sum() == 23
+
+    def test_get_value_sum_empty_data_set(self):
+        _data_extractor = DataExtractor([])
+        assert _data_extractor.get_value_sum() == 0

From 6c53d932bb7045aae5de7fbb753adbdc51ed7140 Mon Sep 17 00:00:00 2001
From: Tomasz-Kluczkowski <tomaszk1@hotmail.com.uk>
Date: Tue, 8 Jan 2019 22:14:23 +0000
Subject: [PATCH 9/9] Small improvements to docstrings; drop pytest notes.

---
 data_extractor.py            | 6 +++---
 tests/test_data_extractor.py | 4 ----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/data_extractor.py b/data_extractor.py
index 6530e51..a3fa47c 100644
--- a/data_extractor.py
+++ b/data_extractor.py
@@ -8,15 +8,15 @@ def __init__(self, data):
     def find_items(self, value=4):
         """
         Find and return a new list of items where key "value" is greater than or equal to parameter value.
-        :param value: int, value to find items for, default = 4.
+        :param value: int, value to find items for.
         :return: list(dict), list of dictionaries matching the above filtering rule.
         """
         return [item for item in self.data if item.get('value') and item.get('value') >= value]
 
     def amend_domain_values(self, prefix='www.'):
         """
-        Fixes missing parts of the domain names.
-        :param prefix: str, prefix to add to the domain name, default = 'www'.
+        Fixes missing parts of the domain names. By default we add missing 'www.'.
+        :param prefix: str, prefix to add to the domain name.
         :return: amended: list(dict), amended list of web records.
         """
         amended = []
diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
index 444675c..29a453d 100644
--- a/tests/test_data_extractor.py
+++ b/tests/test_data_extractor.py
@@ -1,7 +1,3 @@
-# Tests are written for pytest framework.
-# use: pip install pytest
-# to test: pytest
-
 from data_extractor import DataExtractor
 from websites.resources.data import WEBSITES