# Item 58: Profile before optimizing
from random import randint
from cProfile import Profile
from pstats import Stats
from bisect import bisect_left


# The dynamic nature of Python causes surprising behaviors in its runtime
# performance. Operations you might assume are slow are actually very fast
# (string manipulation, generators). Language features you might assume are
# fast are actually very slow (attribute access, function calls). The true
# source of slowdowns in a Python program can be obscure.

# The best approach is to ignore your intuition and directly measure the
# performance of a program before you try to optimize it. Python provides a
# built-in profiler for determining which parts of a program are responsible
# for its execution time. This lets you focus your optimization efforts on
# the biggest sources of trouble and ignore parts of the program that don't
# impact speed.

# For example, say you want to determine why an algorithm in your program is
# slow. Here, I define a function that sorts a list of data using an insertion
# sort.


def insertion_sort(data):
    result = []
    for value in data:
        insert_value(result, value)
    return result


# The core mechanism of the insertion sort is the function that finds the
# insertion point for each piece of data. Here, I define an extremely
# inefficient version of the insert_value function that does a linear scan
# over the input array:


def insert_value(array, value):
    for i, existing in enumerate(array):
        if existing > value:
            array.insert(i, value)
            return
    array.append(value)


# To profile insertion_sort and insert_value, I create a data set of random
# numbers and define a test function to pass to the profiler.

max_size = 10**4
data = [randint(0, max_size) for _ in range(max_size)]
print(data)
test = lambda: insertion_sort(data)

# Python provides two built-in profilers, one that is pure Python (profile)
# and another that is a C-extension module (cProfile). The cProfile built-in
# module is better because of its minimal impact on the performance of your
# program while it's being profiled. The pure-Python alternative imposes a
# high overhead that will skew the results.
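
# As an optional aside (not needed for the rest of this example), the cProfile
# module can also be run from the command line without modifying the program
# at all:
#
#   python3 -m cProfile -s cumulative item_58_profile_before_optimizing.py
#
# The -s flag picks the sort order for the printed statistics, here cumulative
# time, matching the sort_stats('cumulative') calls used below.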

# Note
# When profiling a Python program, be sure that what you're measuring is the
# code itself and not any external systems. Beware of functions that access
# the network or resources on disk. These may appear to have a large impact on
# your program's execution time because of the slowness of the underlying
# system. If your program uses a cache to mask the latency of slow resources
# like these, you should also ensure that it's properly warmed up before you
# start profiling.
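
# A minimal sketch of that advice. The slow_read function below is only a
# stand-in for an expensive disk or network access (it is not part of the
# original example); calling it once up front warms its cache so that the
# profiled call measures the code itself rather than the slow resource.

from functools import lru_cache
from time import sleep


@lru_cache(maxsize=None)
def slow_read(key):
    sleep(0.1)  # simulate a slow external resource
    return key * 2


slow_read(21)  # warm the cache before profiling begins

warm_profiler = Profile()
warm_profiler.runcall(lambda: slow_read(21))  # measures only the cached path
Stats(warm_profiler).strip_dirs().sort_stats('cumulative').print_stats()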

# Here, I instantiate a Profile object from the cProfile module and run the
# test function through it using the runcall method:

profiler = Profile()
profiler.runcall(test)

# Once the test function has finished running, I can extract statistics about
# its performance using the pstats built-in module and its Stats class.
# Various methods on a Stats object adjust how to select and sort the
# profiling information to show only the things you care about.

stats = Stats(profiler)
stats.strip_dirs()
stats.sort_stats('cumulative')
stats.print_stats()

# The output is a table of information organized by function. The data sample
# is taken only from the time the profiler was active, during the runcall
# method above.

# 20004 function calls in 1.495 seconds
#
# Ordered by: cumulative time
#
# ncalls tottime percall cumtime percall filename:lineno(function)
# 1 0.000 0.000 1.495 1.495 profile:0(<function <lambda> at 0x7f0b2a8af048>)
# 1 0.000 0.000 1.495 1.495 item_58_profile_before_optimizing.py:52(<lambda>)
# 1 0.018 0.018 1.495 1.495 item_58_profile_before_optimizing.py:25(insertion_sort)
# 10000 1.454 0.000 1.477 0.000 item_58_profile_before_optimizing.py:38(insert_value)
# 9992 0.023 0.000 0.023 0.000 :0(insert)
# 1 0.000 0.000 0.000 0.000 :0(setprofile)
# 8 0.000 0.000 0.000 0.000 :0(append)
# 0 0.000 0.000 profile:0(profiler)

# Here's a quick guide to what the profiler statistics columns mean (a short
# example of re-sorting by these columns follows the list):
# 1. ncalls: The number of calls to the function during the profiling period.
# 2. tottime: The number of seconds spent executing the function, excluding
#    time spent executing other functions it calls.
# 3. tottime percall: The average number of seconds spent in the function each
#    time it was called, excluding time spent executing other functions it
#    calls. This is tottime divided by ncalls.
# 4. cumtime: The cumulative number of seconds spent executing the function,
#    including time spent in all other functions it calls.
# 5. cumtime percall: The average number of seconds spent in the function each
#    time it was called, including time spent in all other functions it calls.
#    This is cumtime divided by ncalls.
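
# For example, re-sorting the same Stats object by tottime highlights where
# the self time (excluding callees) is going; printing only the top five
# entries keeps the output short:

stats.sort_stats('tottime')
stats.print_stats(5)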

# Looking at the profiler statistics table above, I can see that the biggest
# use of CPU in my test is the cumulative time spent in the insert_value
# function. Here, I redefine that function to use the bisect built-in module
# (see Item 46: "Use built-in algorithms and data structures"):


def insert_value_bi(array, value):
    i = bisect_left(array, value)
    array.insert(i, value)


# I can run the profiler again and generate a new table of profiler
# statistics. The new function is much faster, with a cumulative time spent
# that is more than 20 times smaller than that of the previous insert_value
# function.


def insertion_sort_bi(data):
    result = []
    for value in data:
        insert_value_bi(result, value)
    return result


data = [randint(0, max_size) for _ in range(max_size)]
print(data)
test_bi = lambda: insertion_sort_bi(data)
profiler_bi = Profile()
profiler_bi.runcall(test_bi)
stats_bi = Stats(profiler_bi)
stats_bi.strip_dirs()
stats_bi.sort_stats('cumulative')
stats_bi.print_stats()
# 30004 function calls in 0.075 seconds
#
# Ordered by: cumulative time
#
# ncalls tottime percall cumtime percall filename:lineno(function)
# 1 0.000 0.000 0.075 0.075 profile:0(<function <lambda> at 0x7f6dbd4861e0>)
# 1 0.000 0.000 0.075 0.075 item_58_profile_before_optimizing.py:142(<lambda>)
# 1 0.014 0.014 0.075 0.075 item_58_profile_before_optimizing.py:133(insertion_sort_bi)
# 10000 0.028 0.000 0.061 0.000 item_58_profile_before_optimizing.py:123(insert_value_bi)
# 10000 0.019 0.000 0.019 0.000 :0(insert)
# 10000 0.014 0.000 0.014 0.000 :0(bisect_left)
# 1 0.000 0.000 0.000 0.000 :0(setprofile)
# 0 0.000 0.000 profile:0(profiler)
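
# As an optional cross-check (not part of the original text), the timeit
# built-in module can confirm the end-to-end speedup outside the profiler,
# without any profiling overhead:

from timeit import timeit

print('insertion_sort:   ', timeit(test, number=1))
print('insertion_sort_bi:', timeit(test_bi, number=1))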

# Sometimes, when you're profiling an entire program, you'll find that a
# common utility function is responsible for the majority of execution time.
# The default output from the profiler makes this situation difficult to
# understand because it doesn't show how the utility function is called by
# many different parts of your program.

# For example, here the my_utility function is called repeatedly by two
# different functions in the program:


def my_utility(a, b):
    if a > b:
        return a + b
    else:
        return a * b


def first_func():
    for _ in range(100000):
        my_utility(4, 5)


def second_func():
    for _ in range(1000):
        my_utility(3, 1)


def my_program():
    for _ in range(20):
        first_func()
        second_func()


# Profiling this code and using the default print_stats output will generate
# output statistics that are confusing.

profiler_my = Profile()
profiler_my.runcall(my_program)
stats_my = Stats(profiler_my)
stats_my.strip_dirs()
stats_my.sort_stats('cumulative')
stats_my.print_stats()
# 2020043 function calls in 3.648 seconds
#
# Ordered by: cumulative time
#
# ncalls tottime percall cumtime percall filename:lineno(function)
# 1 0.000 0.000 3.648 3.648 profile:0(<function my_program at 0x7fbbebba8400>)
# 1 0.000 0.000 3.648 3.648 item_58_profile_before_optimizing.py:190(my_program)
# 20 2.005 0.100 3.612 0.181 item_58_profile_before_optimizing.py:180(first_func)
# 2020000 1.623 0.000 1.623 0.000 item_58_profile_before_optimizing.py:173(my_utility)
# 20 0.020 0.001 0.035 0.002 item_58_profile_before_optimizing.py:185(second_func)
# 1 0.000 0.000 0.000 0.000 :0(setprofile)
# 0 0.000 0.000 profile:0(profiler)

# The my_utility function is clearly the source of most execution time, but
# it's not immediately obvious why that function is called so much. If you
# search through the program's code, you'll find multiple call sites for
# my_utility and still be confused.

# To deal with this, the Python profiler provides a way of seeing which
# callers contributed to the profiling information of each function.

stats_my.print_callers()

# This profiler statistics table shows functions called on the left and who
# was responsible for making the call on the right. Here, it's clear that
# my_utility is most used by first_func:

# Ordered by: cumulative time
#
# Function was called by...
# profile:0(<function my_program at 0x7f2b11841400>) <- profile:0(profiler)(1) 0.000
# item_58_profile_before_optimizing.py:190(my_program) <- profile:0(<function my_program at 0x7f2b11841400>)(1) 3.869
# item_58_profile_before_optimizing.py:180(first_func) <- item_58_profile_before_optimizing.py:190(my_program)(20) 3.869
# item_58_profile_before_optimizing.py:173(my_utility) <- item_58_profile_before_optimizing.py:180(first_func)(2000000) 3.831
#                                                         item_58_profile_before_optimizing.py:185(second_func)(20000) 0.037
# item_58_profile_before_optimizing.py:185(second_func) <- item_58_profile_before_optimizing.py:190(my_program)(20) 3.869
# :0(setprofile) <- profile:0(<function my_program at 0x7f2b11841400>)(1) 3.869
# profile:0(profiler) <-
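
# The Stats class also supports the complementary view, print_callees, which
# shows, for each function, the functions it called and how the time was
# attributed to them:

stats_my.print_callees()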

# Things to remember

# 1. It's important to profile Python programs before optimizing because the
#    source of slowdowns is often obscure.
# 2. Use the cProfile module instead of the profile module because it provides
#    more accurate profiling information.
# 3. The Profile object's runcall method provides everything you need to
#    profile a tree of function calls in isolation.
# 4. The Stats object lets you select and print the subset of profiling
#    information you need to see to understand your program's performance.