import pandas as pd
import numpy as np
import warnings
from typing import Dict, List, Any, Union, Optional, Tuple


class DataAnalyser:
    """Utility class for analyzing datasets and providing statistical insights."""

    @classmethod
    def analyse(cls, df: pd.DataFrame, correlation_threshold: float = 0.7) -> Dict[str, Any]:
        """
        Analyze a DataFrame and extract useful statistics and insights.

        Args:
            df: Input DataFrame to analyze
            correlation_threshold: Threshold for identifying strong correlations

        Returns:
            Dictionary containing analysis results
        """
        print("Analyzing data...")

        # Initialize results structure
        results = {
            "columns": [],
            "grp_columns": {},
            "statistical_analysis": {},
            "cross_row_relationship": {},
            "cross_column_relationship": {},
        }

        # Categorize columns
        results["grp_columns"] = cls.categorize_columns(df)
        results["columns"] = df.columns.tolist()

        # Analyze each type of column
        stats = {}
        if results["grp_columns"]["numeric"]:
            stats["numeric"] = cls.analyze_numeric_columns(df, results["grp_columns"]["numeric"])

        if results["grp_columns"]["categorical"]:
            stats["categorical"] = cls.analyze_categorical_columns(df, results["grp_columns"]["categorical"])

        if results["grp_columns"]["datetime"]:
            stats["datetime"] = cls.analyze_datetime_columns(df, results["grp_columns"]["datetime"])

        results["statistical_analysis"] = stats

        # Analyze cross-row relationships
        results["cross_row_relationship"] = cls.analyze_cross_row_relationships(df)

        # Analyze cross-column relationships
        if results["grp_columns"]["numeric"] and len(results["grp_columns"]["numeric"]) > 1:
            results["cross_column_relationship"] = cls.analyze_cross_column_relationships(
                df, results["grp_columns"]["numeric"], correlation_threshold
            )

        return results

    @classmethod
    def categorize_columns(cls, df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Categorize DataFrame columns by their data types.

        Args:
            df: Input DataFrame

        Returns:
            Dictionary mapping column types to lists of column names
        """
        result = {
            "numeric": [],
            "categorical": [],
            "datetime": [],
            "text": [],
            "other": []
        }

        for col in df.columns:
            column = df[col]

            # Check if already datetime type - most reliable method
            if pd.api.types.is_datetime64_any_dtype(column):
                result["datetime"].append(col)

            # Check numeric types (booleans are treated as categorical below)
            elif pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
                result["numeric"].append(col)

            # Check categorical and boolean
            elif isinstance(column.dtype, pd.CategoricalDtype) or pd.api.types.is_bool_dtype(column):
                result["categorical"].append(col)

            # Check for text columns
            elif pd.api.types.is_string_dtype(column) or pd.api.types.is_object_dtype(column):
                # Treat low-cardinality string columns as categorical
                non_null_count = column.count()
                if non_null_count > 0:
                    unique_ratio = column.nunique() / non_null_count
                    if unique_ratio < 0.2:  # Fewer than 20% unique values -> categorical
                        result["categorical"].append(col)
                    else:
                        result["text"].append(col)
                else:
                    result["text"].append(col)

            # Everything else
            else:
                result["other"].append(col)

        # Verify all columns are categorized
        categorized = []
        for category, cols in result.items():
            categorized.extend(cols)

        missing = set(df.columns) - set(categorized)
        if missing:
            print(f"Found uncategorized columns: {missing}")
            result["other"].extend(list(missing))

        return result

    @classmethod
    def analyze_numeric_columns(cls, df: pd.DataFrame, numeric_columns: List[str]) -> Dict[str, Dict[str, Any]]:
        """
        Analyze numeric columns to extract statistical information.

        Args:
            df: Input DataFrame
            numeric_columns: List of numeric column names

        Returns:
            Dictionary mapping column names to their statistics
        """
        result = {}

        for col in numeric_columns:
            # Skip columns with all NaN values
            if df[col].isna().all():
                continue

            stats = {}

            # Basic statistics
            stats["count"] = int(df[col].count())
            stats["mean"] = float(df[col].mean())
            stats["median"] = float(df[col].median())
            stats["std"] = float(df[col].std())
            stats["min"] = float(df[col].min())
            stats["max"] = float(df[col].max())

            # Calculate percentiles
            for p in [25, 75, 90, 95, 99]:
                stats[f"p{p}"] = float(df[col].quantile(p / 100))

            # Null value statistics
            null_count = int(df[col].isna().sum())
            stats["null_count"] = null_count
            stats["null_percentage"] = float((null_count / len(df)) * 100)

            result[col] = stats

        return result

    @classmethod
    def analyze_categorical_columns(cls, df: pd.DataFrame, categorical_columns: List[str]) -> Dict[str, Dict[str, Any]]:
        """
        Analyze categorical columns to extract distribution information.

        Args:
            df: Input DataFrame
            categorical_columns: List of categorical column names

        Returns:
            Dictionary mapping column names to their statistics
        """
        result = {}

        for col in categorical_columns:
            # Skip columns with all NaN values
            if df[col].isna().all():
                continue

            stats = {}

            # Basic statistics
            stats["count"] = int(df[col].count())
            stats["unique_count"] = int(df[col].nunique())

            # Value distribution (top 10 most common values)
            value_counts = df[col].value_counts().head(10).to_dict()
            # Convert any non-string keys to strings for JSON compatibility
            top_values = {}
            for k, v in value_counts.items():
                key = str(k) if not isinstance(k, str) else k
                top_values[key] = int(v)

            stats["top_values"] = top_values

            # Calculate entropy to measure randomness
            counts = df[col].value_counts()
            probs = counts / counts.sum()
            entropy = -np.sum(probs * np.log2(probs))
            stats["entropy"] = float(entropy)

            # Null value statistics
            null_count = int(df[col].isna().sum())
            stats["null_count"] = null_count
            stats["null_percentage"] = float((null_count / len(df)) * 100)

            result[col] = stats

        return result

    @classmethod
    def analyze_datetime_columns(cls, df: pd.DataFrame, datetime_columns: List[str]) -> Dict[str, Dict[str, Any]]:
        """
        Analyze datetime columns to extract temporal patterns.

        Args:
            df: Input DataFrame
            datetime_columns: List of datetime column names

        Returns:
            Dictionary mapping column names to their statistics
        """
        result = {}

        for col in datetime_columns:
            # Skip columns with all NaN values
            if df[col].isna().all():
                continue

            stats = {}

            # Basic statistics
            stats["count"] = int(df[col].count())
            stats["min"] = str(df[col].min())
            stats["max"] = str(df[col].max())

            # Calculate temporal span
            min_date = df[col].min()
            max_date = df[col].max()
            if pd.notna(min_date) and pd.notna(max_date):
                span_days = (max_date - min_date).total_seconds() / (60 * 60 * 24)
                stats["span_days"] = float(span_days)

            # Extract date parts distribution
            date_parts = {}

            # Year distribution
            if df[col].dt.year.nunique() > 1:
                year_counts = df[col].dt.year.value_counts().to_dict()
                date_parts["year"] = {str(k): int(v) for k, v in year_counts.items()}

            # Month distribution
            month_counts = df[col].dt.month.value_counts().to_dict()
            date_parts["month"] = {str(k): int(v) for k, v in month_counts.items()}

            # Day of week distribution
            dow_counts = df[col].dt.dayofweek.value_counts().to_dict()
            date_parts["day_of_week"] = {str(k): int(v) for k, v in dow_counts.items()}

            # Hour distribution (if time component exists)
            if (df[col].dt.hour != 0).any():
                hour_counts = df[col].dt.hour.value_counts().to_dict()
                date_parts["hour"] = {str(k): int(v) for k, v in hour_counts.items()}

            stats["date_parts"] = date_parts

            # Null value statistics
            null_count = int(df[col].isna().sum())
            stats["null_count"] = null_count
            stats["null_percentage"] = float((null_count / len(df)) * 100)

            result[col] = stats

        return result

    @classmethod
    def analyze_cross_row_relationships(cls, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze relationships across rows, such as duplicates and null patterns.

        Args:
            df: Input DataFrame

        Returns:
            Dictionary containing cross-row relationship information
        """
        result = {}

        # Analyze duplicates
        duplicates = df.duplicated()
        duplicate_count = int(duplicates.sum())
        duplicate_percentage = float((duplicate_count / len(df)) * 100)

        result["duplicates"] = {
            "count": duplicate_count,
            "percentage": duplicate_percentage
        }

        # Analyze rows with null values
        rows_with_null = df.isna().any(axis=1)
        null_rows_count = int(rows_with_null.sum())
        null_rows_percentage = float((null_rows_count / len(df)) * 100)

        result["null_rows"] = {
            "count": null_rows_count,
            "percentage": null_rows_percentage
        }

        return result

    @classmethod
    def analyze_cross_column_relationships(
        cls, df: pd.DataFrame, numeric_columns: List[str], correlation_threshold: float
    ) -> Dict[str, Any]:
        """
        Analyze relationships between columns, such as correlations.

        Args:
            df: Input DataFrame
            numeric_columns: List of numeric column names
            correlation_threshold: Threshold for identifying strong correlations

        Returns:
            Dictionary containing cross-column relationship information
        """
        result = {}

        # Calculate correlations between numeric columns
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            corr_matrix = df[numeric_columns].corr()

        # Extract strong correlations (ignore self-correlations)
        strong_correlations = {}
        for i in range(len(numeric_columns)):
            for j in range(i + 1, len(numeric_columns)):
                col1 = numeric_columns[i]
                col2 = numeric_columns[j]
                corr_value = corr_matrix.iloc[i, j]

                # Skip NaN correlations
                if pd.isna(corr_value):
                    continue

                # Keep signed correlations whose absolute value meets the threshold
                if abs(corr_value) >= correlation_threshold:
                    pair_name = f"{col1} - {col2}"
                    strong_correlations[pair_name] = float(corr_value)

        if strong_correlations:
            result["correlations"] = strong_correlations

        return result
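

# Minimal usage sketch (assumption: the module is run as a script; the DataFrame
# below and its column names are illustrative examples, not part of the analyser).
if __name__ == "__main__":
    demo_df = pd.DataFrame({
        "age": [25, 32, 47, 51, 38, 29, 41, 36],
        "income": [40_000, 52_000, 88_000, 91_000, 61_000, 45_000, 72_000, 58_000],
        "segment": ["a", "b", "a", "c", "b", "a", "c", "b"],
        "signup": pd.to_datetime([
            "2021-01-05", "2021-03-12", "2021-06-30", "2021-07-18",
            "2021-09-02", "2021-10-21", "2021-11-11", "2021-12-01",
        ]),
    })

    report = DataAnalyser.analyse(demo_df, correlation_threshold=0.5)
    print(report["grp_columns"])                              # column categorization
    print(report["statistical_analysis"]["numeric"]["age"])   # per-column statistics
    print(report.get("cross_column_relationship"))            # strong correlations, if any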