Merge pull request #30 from alberthu16/day41

Albert Hu · web-flow · commit 62df6ea9d67d · 2017-05-28T02:48:12.000-07:00
Day 41: Redo topKFrequent using a heap for efficiency
diff --git a/day41/README.md b/day41/README.md
@@ -0,0 +1,43 @@
+[Today's challenge is actually a follow-up on day37's challenge -- using a heap instead of radix sort]
+
+Question of the day: https://leetcode.com/problems/top-k-frequent-elements/#/description
+
+Given a non-empty array of integers, return the k most frequent elements.
+
+For example,  
+Given `[1,1,1,2,2,3]` and `k = 2`, return `[1,2]`.
+
+Note:   
+* You may assume k is always valid, 1 ≤ k ≤ number of unique elements.  
+* Your algorithm's time complexity must be better than O(n log n), where
+  n is the array's size.
+
+## Ideas
+
+Can't do a normal sort, since that alone will take `O(nlogn)` runtime.
+The input array isn't sorted, so we need to keep track of a count and organize
+that count somehow as we iterate through the integers in the array. There
+don't seem to be any constraints on the range of integers in the array,
+so I'll assume that possible elements in the array range from -maxInt to
+maxInt.
+
+I think I can actually use radix sort again. Same idea as the challenge from
+[Day 36](../day36).
+
+## Code
+[Day 37 - Python](../day37/topKFrequent.py)
+
+## Follow-up
+
+Over the past few days ([38](../day38), [39](../day39), [40](../day40)), I got a
+little more familiar with the heap data structure and finally understand why
+heapifying an unsorted array can be done in linear time. The number of nodes at
+each level halves as the sift-down distance grows by one, so the total work sums
+to a linear amount. Anyways, I can use this `O(n)` time to heapify an unsorted
+array of frequencies, and then pop off the top `k` frequencies in `O(klogn)` time.
+The overall runtime is `O(n + klogn)`, which is `O(nlogn)` at worst (`k == n`).
+However, the real savings are in the `O(n)` space for storing the heap's elements.
+Much better than the `O(max value of the input array)` I had before.
+
+## Code
+[Day 41 - Python](./topKFrequent.py)
diff --git a/day41/topKFrequent.py b/day41/topKFrequent.py
@@ -0,0 +1,105 @@
+## A sample heap data structure ##
+from collections import deque
+
+class MaxHeap:
+    """Binary max-heap of (value, priority) pairs, ordered by priority (item[1]).
+
+    Backed by a list for O(1) indexing; runs unchanged on Python 2 and 3.
+    """
+
+    def __init__(self, arr=None):
+        """Build a heap from an optional iterable of (value, priority) pairs."""
+        self.heap = []
+        self.size = 0
+        if arr is not None:
+            self.heapify(arr)
+
+    # runtime: O(logn) aka the height of the heap
+    def getMax(self):
+        """Remove and return the highest-priority pair, or None when empty."""
+        if self.size == 0:
+            return None
+        top = self.heap[0]
+        last = self.heap.pop()
+        self.size -= 1
+        if self.size > 0:
+            # Move the last leaf to the root, then restore the heap property.
+            self.heap[0] = last
+            self.bubbleDown(0)
+        return top
+
+    # runtime: O(1)
+    def peek(self):
+        """Return the highest-priority pair without removing it (None if empty)."""
+        if self.size > 0:
+            return self.heap[0]
+
+    # runtime: O(logn) aka the height of the heap
+    def push(self, val):
+        """Insert one (value, priority) pair."""
+        self.heap.append(val)
+        self.size += 1
+        self.bubbleUp()
+
+    # runtime: O(n) -- only internal nodes sift down, and most sit near the leaves
+    def heapify(self, arr):
+        """Replace the heap's contents with the pairs from `arr`."""
+        self.heap = list(arr)
+        self.size = len(self.heap)
+        for i in range(self.size // 2 - 1, -1, -1):
+            self.bubbleDown(i)
+
+    # runtime: O(1)
+    def isEmpty(self):
+        """Return True when the heap holds no pairs."""
+        return self.size == 0
+
+    def bubbleDown(self, index):
+        """Sift the pair at `index` down until neither child outranks it."""
+        h = self.heap
+        i = index
+        while True:
+            largest = i
+            for child in (2*i + 1, 2*i + 2):
+                if child < self.size and h[child][1] > h[largest][1]:
+                    largest = child
+            if largest == i:
+                return
+            h[i], h[largest] = h[largest], h[i]
+            i = largest
+
+    def bubbleUp(self):
+        """Sift the last pair up while it outranks its parent at (i - 1) // 2."""
+        h = self.heap
+        i = self.size - 1
+        while i > 0 and h[i][1] > h[(i - 1) // 2][1]:
+            h[i], h[(i - 1) // 2] = h[(i - 1) // 2], h[i]
+            i = (i - 1) // 2
+
+from collections import Counter
+
+def topKFrequent(nums, k):
+    """Return up to k of the most frequent values in nums, most frequent first."""
+    h = MaxHeap(Counter(nums).items())
+    ret = []
+    while k > 0 and not h.isEmpty():  # stop early instead of crashing on a large k
+        ret.append(h.getMax()[0])
+        k -= 1
+    return ret
+
+def testTopKFrequent():
+    """Check topKFrequent against known answers, ignoring result order."""
+    cases = [
+        ([], 0, []), ([1], 1, [1]), ([-1, -1], 1, [-1]),
+        ([1,1,1,2,2,3], 2, [1, 2]), ([-1,-1,-1,2,2,3], 2, [-1, 2]), ([1,1,1,2,2,3], 3, [1, 2, 3]),
+        ([1,1,1,2,2,2,3,3,3], 3, [1, 2, 3]), ([4,1,-1,2,-1,2,3], 2, [-1, 2]),
+        ([3,2,3,1,2,4,5,5,6,7,7,8,2,3,1,1,1,10,11,5,6,2,4,7,8,5,6], 10, [1,2,5,3,7,6,4,8,10,11]),
+    ]
+    for nums, k, expected in cases:
+        assert set(topKFrequent(nums, k)) == set(expected)
+
+def main():
+    testTopKFrequent()  # assertion-based self-checks; silent on success
+
+if __name__ == "__main__":  # only when run as a script, not on import
+    main()