Skip to content

Commit bb76e1c

Browse files
add some date preprocessing
1 parent 523d190 commit bb76e1c

File tree

13 files changed

+111
-5
lines changed

13 files changed

+111
-5
lines changed

python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@ def apply(self, pipeline: IngestionPipeline):
1717

1818
if isinstance(pipeline.raw_data, DataFrame):
1919
pipeline.df = pipeline.raw_data
20+
# check if raw data is a path to a csv file and read it into a DataFrame
2021
else:
2122
raise DataSourceError

python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ def apply(self, pipeline: IngestionPipeline):
2727
) # noqa
2828

2929
pipeline.column_type_map = column_types
30+
if "unknown" in pipeline.column_type_map.values():
31+
pipeline.needs_type_map = True
32+
33+
pipeline.type_collections = self.build_type_collections(column_types)
3034

3135
def analyze_column(self, column: Series):
3236
"""
@@ -50,7 +54,7 @@ def analyze_column(self, column: Series):
5054
column_type = "datetime"
5155

5256
if column_type is None:
53-
column_type = "object"
57+
column_type = "unknown"
5458

5559
return column_type
5660

@@ -110,7 +114,7 @@ def datetime_check(self, column: Series):
110114
except Exception as e: # noqa
111115
pass
112116

113-
# if format of values look like dates
117+
# if format of values looks like dates
114118

115119
return False
116120

@@ -124,3 +128,14 @@ def id_check(self, types, values):
124128
return all([item == int for item in set(types) if item is not None]) and len(
125129
set(values)
126130
) == len(self.df)
131+
132+
@staticmethod
133+
def build_type_collections(column_type_map):
134+
collections = {}
135+
136+
for data_type in ["datetime", "numeric", "categorical"]:
137+
collections[data_type] = [
138+
col for col in column_type_map if column_type_map[col] == data_type
139+
]
140+
141+
return collections

python/src/lazylearn/lazylearn.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,28 @@
11
from ingestion.ingestion_pipeline import Ingestion
2+
from preprocessing.time.date_processor import date_processor
23

34

45
class LazyLearner:
    """
    Entry point that ingests a dataset, decides the modelling task and
    runs the date preprocessing step.
    """

    def __init__(self):
        self.dataset = None
        self.task = None
        self.models = None
        self.leaderboard = None

    def create_project(self, data, target, task="infer"):
        """
        Ingest ``data`` and prepare it for modelling.

        :param data: raw data handed to the ingestion pipeline
        :param target: name of the target column; must be a key of the
            ingested dataset's ``column_type_map`` when ``task`` is
            "infer" (a missing key raises KeyError)
        :param task: "infer" to derive the task from the target's
            column type, otherwise the task to use as given
        """
        # ingest data
        self.dataset = Ingestion().run(data)

        if task == "infer":
            # if target is numeric then regression, else classification
            target_type = self.dataset.column_type_map[target]
            if target_type == "numeric":
                self.task = "regression"
            else:
                self.task = "classification"
        else:
            # an explicitly requested task must not be silently dropped
            self.task = task

        # process dates
        self.dataset = date_processor(self.dataset)

        # preprocess
1328

python/src/lazylearn/models/models.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,19 @@
22

33

44
class Dataset:
5-
def __init__(self, df: DataFrame, column_type_map: dict):
5+
def __init__(
6+
self,
7+
df: DataFrame,
8+
column_type_map: dict,
9+
summary_stats: dict,
10+
type_collections: dict,
11+
):
612
self.name = None
713
self.description = None
814
self.df = df
915
self.column_type_map = column_type_map
16+
self.summary_stats = summary_stats
17+
self.type_collections = type_collections
1018

1119
def save(self):
1220
raise NotImplementedError

python/src/lazylearn/pipeline/pipeline.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,23 @@ def __init__(self):
2929
self.df: DataFrame = None
3030
self.column_type_map: dict = None
3131
self.summary_stats: dict = {}
32+
self.needs_type_map: bool = False
33+
self.type_collections: dict = None
3234

3335
def response(self):
    """
    Bundle the state gathered by this pipeline into a Dataset.

    :return: Dataset built from this pipeline's df, column_type_map,
        summary_stats and type_collections
    """
    dataset_fields = {
        "df": self.df,
        "column_type_map": self.column_type_map,
        "summary_stats": self.summary_stats,
        "type_collections": self.type_collections,
    }
    return Dataset(**dataset_fields)
42+
43+
44+
class ModelPipeline(Pipeline):
    """
    Pipeline subclass for modelling; currently adds no state or
    behavior beyond what Pipeline sets up.
    """

    def __init__(self):
        # defer entirely to the parent initializer
        super().__init__()
47+
48+
49+
class RegressionPipeline(ModelPipeline):
    """
    ModelPipeline subclass for regression; currently adds no state or
    behavior beyond what ModelPipeline sets up.
    """

    def __init__(self):
        # defer entirely to the parent initializer
        super().__init__()

python/src/lazylearn/preprocessing/time/__init__.py

Whitespace-only changes.
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from models.models import Dataset
2+
3+
4+
def date_processor(dataset: Dataset) -> Dataset:
5+
"""
6+
Method that transform date variables into
7+
categorical features.
8+
9+
:param dataset: Dataset object with date features
10+
:return: Dataset object with categorical date
11+
features
12+
"""
13+
new_categorical_cols = []
14+
15+
for date_column in dataset.type_collections["datetime"]:
16+
dataset.df[f"{date_column}_year"] = (
17+
dataset.df[date_column].dt.isocalendar().year
18+
)
19+
dataset.df[f"{date_column}_month"] = dataset.df[date_column].dt.month
20+
dataset.df[f"{date_column}_week"] = (
21+
dataset.df[date_column].dt.isocalendar().week
22+
)
23+
dataset.df[f"{date_column}_day"] = dataset.df[date_column].dt.isocalendar().day
24+
25+
new_categorical_cols.append(f"{date_column}_year")
26+
new_categorical_cols.append(f"{date_column}_month")
27+
new_categorical_cols.append(f"{date_column}_week")
28+
new_categorical_cols.append(f"{date_column}_day")
29+
30+
for cat in new_categorical_cols:
31+
dataset.column_type_map[cat] = "categorical"
32+
dataset.type_collections["categorical"].append(cat)
33+
34+
return dataset

python/src/lazylearn/regression/__init__.py

Whitespace-only changes.

python/src/lazylearn/regression/models/__init__.py

Whitespace-only changes.

python/src/lazylearn/regression/models/randomforest/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)