Skip to content

Commit f3fbd0d

Browse files
durations and simple splits
1 parent d7fca34 commit f3fbd0d

File tree

8 files changed

+61
-2
lines changed

8 files changed

+61
-2
lines changed

python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import pandas as pd
12
from errors.errors import DataSourceError
3+
from ingestion.utils.csv import csv_check
24
from pandas import DataFrame
35
from pipeline.pipeline import IngestionPipeline, PipelineStep
46

def apply(self, pipeline: IngestionPipeline):
    """Populate ``pipeline.df`` from ``pipeline.raw_data``.

    Accepts either an in-memory :class:`pandas.DataFrame` or a path to a
    CSV file; any other input raises :class:`DataSourceError`.

    :param pipeline: ingestion pipeline carrying ``raw_data`` in and
        receiving the parsed ``df`` out
    :raises DataSourceError: when ``raw_data`` is neither a DataFrame nor
        a recognizable CSV path
    """
    if isinstance(pipeline.raw_data, DataFrame):
        pipeline.df = pipeline.raw_data
    # check if raw data is a path to a csv file and read it into csv
    # BUG FIX: was csv_check(pipeline.df) — df is not assigned in this
    # branch; the candidate path lives in raw_data (which read_csv uses).
    elif csv_check(pipeline.raw_data):
        pipeline.df = pd.read_csv(pipeline.raw_data)
    else:
        raise DataSourceError

python/src/lazylearn/ingestion/utils/__init__.py

Whitespace-only changes.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
def csv_check(path):
    """Return True when *path* looks like a path to a CSV file.

    The ingestion pipeline uses this to decide whether ``raw_data`` should
    be handed to ``pandas.read_csv``. Only plain strings with a ``.csv``
    suffix (case-insensitive) qualify; any other object (e.g. an in-memory
    DataFrame, None, a number) is rejected so the caller can fall through
    to its error branch.

    :param path: candidate value to test
    :return: bool — True only for ``str`` values ending in ``.csv``
    """
    return isinstance(path, str) and path.lower().endswith(".csv")

python/src/lazylearn/lazylearn.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
from ingestion.ingestion_pipeline import Ingestion
2+
from model_selection.splitters import test_train_splitter
23
from preprocessing.time.date_processor import date_processor
4+
from preprocessing.time.duration import duration_builder
35

46

57
class LazyLearner:
def __init__(self, random_state=None):
    """Create an empty learner; project state is filled in later.

    :param random_state: seed forwarded to the train/test splitter so
        partitions are reproducible across runs (None = nondeterministic)
    """
    # All of these are populated by create_project / model selection.
    self.dataset = None
    self.task = None
    self.models = None
    self.leaderboard = None
    self.random_state = random_state
1114

1215
def create_project(self, data, target, task="infer"):
1316
# ingest data
@@ -23,11 +26,19 @@ def create_project(self, data, target, task="infer"):
2326
# process dates
2427

2528
self.dataset = date_processor(self.dataset)
29+
self.dataset = duration_builder(self.dataset)
2630

27-
# preprocess
31+
# split partitions
32+
33+
self.dataset = test_train_splitter(self.dataset, random_state=self.random_state)
2834

2935
# set modelling configurations
3036

37+
def run_autopilot(self):
    """Run the automated preprocess/train/evaluate loop.

    Placeholder — not implemented yet.

    :raises NotImplementedError: always, until autopilot lands
    """
    raise NotImplementedError
39+
40+
# preprocess
41+
3142
# train
3243

3344
# eval

python/src/lazylearn/model_selection/__init__.py

Whitespace-only changes.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from models.models import Dataset
2+
from sklearn.model_selection import train_test_split
3+
4+
5+
def test_train_splitter(
    dataset: "Dataset", random_state=None, test_size: float = 0.2
) -> "Dataset":
    """Split ``dataset.df`` into train/test partitions.

    :param dataset: dataset whose ``df`` attribute is split
    :param random_state: forwarded to sklearn for reproducible splits
    :param test_size: fraction of rows held out for the test partition;
        defaults to 0.2, matching the previously hard-coded value
    :return: the same dataset, with ``partitions["train"]`` and
        ``partitions["test"]`` assigned
    """
    train_partition, test_partition = train_test_split(
        dataset.df, test_size=test_size, random_state=random_state
    )

    dataset.partitions["test"] = test_partition
    dataset.partitions["train"] = train_partition

    return dataset
14+
15+
16+
def cv_splitter(dataset: "Dataset") -> "Dataset":
    """Placeholder for cross-validation splitting.

    Currently a no-op: the dataset is returned unchanged.
    """
    return dataset

python/src/lazylearn/models/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def __init__(
1515
self.column_type_map = column_type_map
1616
self.summary_stats = summary_stats
1717
self.type_collections = type_collections
18+
self.partitions: dict = {}
1819

1920
def save(self):
2021
raise NotImplementedError
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from models.models import Dataset
2+
3+
4+
def duration_builder(dataset: "Dataset") -> "Dataset":
    """Add pairwise duration features for every pair of datetime columns.

    For each unordered pair of datetime columns ``(a, b)`` (in detection
    order), a new integer column ``duration(a-b)`` holding the difference
    in whole days is added to ``dataset.df``, recorded in
    ``column_type_map`` and appended to the ``"numeric"`` type collection.

    :param dataset: dataset exposing ``df``, ``column_type_map`` and
        ``type_collections`` attributes; mutated in place
    :return: the same dataset
    """
    # .get() returns None when no datetime columns were detected — guard
    # against len(None).
    date_cols = dataset.type_collections.get("datetime") or []

    if len(date_cols) > 1:
        for i in range(len(date_cols)):
            for j in range(i + 1, len(date_cols)):
                col_name = f"duration({date_cols[i]}-{date_cols[j]})"
                # .dt.days replaces .astype("timedelta64[D]").astype(int),
                # which raises TypeError on pandas >= 2.0.
                dataset.df[col_name] = (
                    dataset.df[date_cols[i]] - dataset.df[date_cols[j]]
                ).dt.days
                dataset.column_type_map[col_name] = "numeric"
                # The "numeric" bucket may not exist yet on this dataset.
                dataset.type_collections.setdefault("numeric", []).append(
                    col_name
                )

    return dataset

0 commit comments

Comments
 (0)