oracle
diff --git a/‎ads/dataset/classification_dataset.py‎
Lines changed: 2 additions & 2 deletions b/‎ads/dataset/classification_dataset.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎ads/dataset/dataset.py‎
Lines changed: 45 additions & 4 deletions b/‎ads/dataset/dataset.py‎
Lines changed: 45 additions & 4 deletions
diff --git a/‎ads/dataset/dataset_with_target.py‎
Lines changed: 170 additions & 7 deletions b/‎ads/dataset/dataset_with_target.py‎
Lines changed: 170 additions & 7 deletions
diff --git a/‎ads/dataset/forecasting_dataset.py‎
Lines changed: 1 addition & 1 deletion b/‎ads/dataset/forecasting_dataset.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ads/dataset/helper.py‎
Lines changed: 18 additions & 1 deletion b/‎ads/dataset/helper.py‎
Lines changed: 18 additions & 1 deletion
diff --git a/‎ads/dataset/regression_dataset.py‎
Lines changed: 1 addition & 1 deletion b/‎ads/dataset/regression_dataset.py‎
Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8; -*-
 
-# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 import pandas as pd
@@ -22,7 +22,7 @@ class ClassificationDataset(ADSDatasetWithTarget):
 
     def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
         ADSDatasetWithTarget.__init__(
-            self, df, sampled_df, target, target_type, shape, **kwargs
+            self, df=df, sampled_df=sampled_df, target=target, target_type=target_type, shape=shape, **kwargs
         )
 
     def auto_transform(
 
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*--
 
-# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 from __future__ import print_function, absolute_import, division
@@ -16,7 +16,7 @@
 
 from collections import Counter
 from sklearn.preprocessing import FunctionTransformer
-from typing import Iterable, Union
+from typing import Iterable, Tuple, Union
 
 from ads import set_documentation_mode
 from ads.common import utils
@@ -71,8 +71,8 @@ class ADSDataset(PandasDataset):
     def __init__(
         self,
         df,
-        sampled_df,
-        shape,
+        sampled_df=None,
+        shape=None,
         name="",
         description=None,
         type_discovery=True,
@@ -88,6 +88,17 @@ def __init__(
         # to keep performance high and linear no matter the size of the distributed dataset we
         # create a pandas df that's used internally because this has a fixed upper size.
         #
+        if shape is None:
+            shape = df.shape
+
+        if sampled_df is None:
+            sampled_df = generate_sample(
+                df,
+                shape[0],
+                DatasetDefaults.sampling_confidence_level,
+                DatasetDefaults.sampling_confidence_interval,
+                **kwargs,
+            )
         super().__init__(
             sampled_df,
             type_discovery=type_discovery,
@@ -134,6 +145,36 @@ def __repr__(self):
     def __len__(self):
         return self.shape[0]
 
+    @staticmethod
+    def from_dataframe(
+        df,
+        sampled_df=None,
+        shape=None,
+        name="",
+        description=None,
+        type_discovery=True,
+        types=None,
+        metadata=None,
+        progress=DummyProgressBar(),
+        transformer_pipeline=None,
+        interactive=False,
+        **kwargs,
+    ) -> "ADSDataset":
+        """
+        Build an ``ADSDataset`` from a dataframe.
+
+        All arguments are forwarded by keyword to the ``ADSDataset``
+        constructor. When ``shape`` or ``sampled_df`` is left as ``None`` the
+        constructor fills it in (``df.shape`` and a generated sample of ``df``
+        respectively).
+
+        Parameters
+        ----------
+        df:
+            The full dataframe the dataset is built on.
+        sampled_df: optional
+            Pre-computed sample of ``df``.
+        shape: optional
+            Shape of ``df``.
+        name, description, type_discovery, metadata, progress,
+        transformer_pipeline, interactive:
+            Passed through to the constructor unchanged.
+        types: dict, optional
+            Per-column type overrides. ``None`` (the default) is replaced by
+            a new empty dict on every call.
+        **kwargs:
+            Any additional constructor keyword arguments.
+
+        Returns
+        -------
+        ADSDataset
+            The newly constructed dataset.
+        """
+        # A literal ``{}`` default would be one dict object shared by every
+        # call; hand the constructor a fresh one instead.
+        if types is None:
+            types = {}
+
+        return ADSDataset(
+            df=df,
+            sampled_df=sampled_df,
+            shape=shape,
+            name=name,
+            description=description,
+            type_discovery=type_discovery,
+            types=types,
+            metadata=metadata,
+            progress=progress,
+            transformer_pipeline=transformer_pipeline,
+            interactive=interactive,
+            **kwargs,
+        )
+
     @property
     @deprecated(
         "2.5.2", details="The ddf attribute is deprecated. Use the df attribute."
 
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8; -*-
 
-# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 from __future__ import absolute_import, print_function
@@ -10,7 +10,7 @@
 import importlib
 from collections import defaultdict
 from numbers import Number
-from typing import Union
+from typing import Tuple, Union
 
 import pandas as pd
 from ads.common import utils, logger
@@ -23,14 +23,30 @@
 from ads.dataset.dataset import ADSDataset
 from ads.dataset.feature_engineering_transformer import FeatureEngineeringTransformer
 from ads.dataset.feature_selection import FeatureImportance
-from ads.dataset.helper import deprecate_default_value, deprecate_variable
+from ads.dataset.helper import (
+    DatasetDefaults,
+    deprecate_default_value, 
+    deprecate_variable, 
+    generate_sample,
+    get_target_type,
+    is_text_data,
+)
 from ads.dataset.label_encoder import DataFrameLabelEncoder
 from ads.dataset.pipeline import TransformerPipeline
 from ads.dataset.progress import DummyProgressBar
 from ads.dataset.recommendation import Recommendation
 from ads.dataset.recommendation_transformer import RecommendationTransformer
 from ads.dataset.target import TargetVariable
-from ads.type_discovery.typed_feature import DateTimeTypedFeature
+from ads.type_discovery.typed_feature import (
+    CategoricalTypedFeature,
+    ContinuousTypedFeature,
+    DocumentTypedFeature,
+    GISTypedFeature,
+    OrdinalTypedFeature,
+    TypedFeature,
+    DateTimeTypedFeature, 
+    TypedFeature
+)
 from sklearn.model_selection import train_test_split
 from pandas.io.formats.printing import pprint_thing
 from sklearn.preprocessing import FunctionTransformer
@@ -45,10 +61,10 @@ class ADSDatasetWithTarget(ADSDataset, metaclass=ABCMeta):
     def __init__(
         self,
         df,
-        sampled_df,
         target,
-        target_type,
-        shape,
+        sampled_df=None,
+        shape=None,
+        target_type=None,
         sample_max_rows=-1,
         type_discovery=True,
         types={},
@@ -61,6 +77,16 @@ def __init__(
         **kwargs,
     ):
         self.recommendation_transformer = None
+        if shape is None:
+            shape = df.shape
+        if sampled_df is None:
+            sampled_df = generate_sample(
+                df,
+                shape[0],
+                DatasetDefaults.sampling_confidence_level,
+                DatasetDefaults.sampling_confidence_interval,
+                **kwargs,
+            )
 
         if parent is None:
             cols = sampled_df.columns.tolist()
@@ -135,6 +161,8 @@ def __init__(
             cols.insert(0, cols.pop(cols.index(target)))
             self.sampled_df = self.sampled_df[[*cols]]
 
+        if target_type is None:
+            target_type = get_target_type(target, sampled_df, **kwargs)
         self.target = TargetVariable(self, target, target_type)
 
         # remove target from type discovery conversion
@@ -145,6 +173,141 @@ def __init__(
             ):
                 step[1].kw_args["dtypes"].pop(self.target.name)
 
+    @staticmethod
+    def from_dataframe(
+        df: pd.DataFrame,
+        target: str,
+        sampled_df: pd.DataFrame = None,
+        shape: Tuple[int, int] = None,
+        target_type: TypedFeature = None,
+        positive_class=None,
+        **init_kwargs,
+    ):
+        """
+        Build the dataset subclass that matches the type of the target column.
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            The full dataframe.
+        target: str
+            Name of the target column in ``df``.
+        sampled_df: pd.DataFrame, optional
+            Sample of ``df``. When omitted it is produced by
+            ``generate_sample`` with the ``DatasetDefaults`` sampling
+            confidence level and interval.
+        shape: Tuple[int, int], optional
+            Shape of ``df``. Only its first element is read here (falling back
+            to ``df.shape``); the value is forwarded as given.
+        target_type: TypedFeature, optional
+            Type of the target column. When omitted it is obtained from
+            ``get_target_type(target, sampled_df, **init_kwargs)``.
+        positive_class: optional
+            Forwarded only to the binary classification datasets.
+        **init_kwargs:
+            Forwarded to the sampling/type helpers and to the dataset
+            constructor.
+
+        Returns
+        -------
+        RegressionDataset, ForecastingDataset, BinaryClassificationDataset,
+        BinaryTextClassificationDataset, MultiClassClassificationDataset or
+        MultiClassTextClassificationDataset
+
+        Raises
+        ------
+        ValueError
+            If the target column has no non-null values, if the target is a
+            document/text or GIS/coordinate column, or if no problem type can
+            be identified from the target type.
+        """
+        # Imported here rather than at module level; the subclasses import
+        # this module themselves.
+        from ads.dataset.classification_dataset import (
+            BinaryClassificationDataset,
+            BinaryTextClassificationDataset,
+            MultiClassClassificationDataset,
+            MultiClassTextClassificationDataset,
+        )
+        from ads.dataset.forecasting_dataset import ForecastingDataset
+        from ads.dataset.regression_dataset import RegressionDataset
+
+        # Reject an all-null target before any sampling or type discovery is
+        # attempted on it.
+        if len(df[target].dropna()) == 0:
+            logger.warning(
+                "It is not recommended to use an empty column as the target variable."
+            )
+            raise ValueError(
+                "We do not support using empty columns as the chosen target"
+            )
+
+        if sampled_df is None:
+            sampled_df = generate_sample(
+                df,
+                (shape or df.shape)[0],
+                DatasetDefaults.sampling_confidence_level,
+                DatasetDefaults.sampling_confidence_interval,
+                **init_kwargs,
+            )
+
+        if target_type is None:
+            target_type = get_target_type(target, sampled_df, **init_kwargs)
+
+        # Keyword arguments shared by every dataset constructor below.
+        common = dict(
+            df=df,
+            sampled_df=sampled_df,
+            target=target,
+            target_type=target_type,
+            shape=shape,
+            **init_kwargs,
+        )
+
+        if utils.is_same_class(target_type, ContinuousTypedFeature):
+            return RegressionDataset(**common)
+
+        if utils.is_same_class(
+            target_type, DateTimeTypedFeature
+        ) or df.index.dtype.name.startswith("datetime"):
+            return ForecastingDataset(**common)
+
+        # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type
+        if utils.is_same_class(
+            target_type, CategoricalTypedFeature
+        ) or utils.is_same_class(target_type, OrdinalTypedFeature):
+            text_target = is_text_data(sampled_df, target)
+            if target_type.meta_data["internal"]["unique"] == 2:
+                binary_cls = (
+                    BinaryTextClassificationDataset
+                    if text_target
+                    else BinaryClassificationDataset
+                )
+                return binary_cls(positive_class=positive_class, **common)
+            multi_cls = (
+                MultiClassTextClassificationDataset
+                if text_target
+                else MultiClassClassificationDataset
+            )
+            return multi_cls(**common)
+
+        # Document/text and GIS/coordinate columns are not valid targets.
+        # The class checks are made against the discovered target type, not
+        # against the column name.
+        if (
+            utils.is_same_class(target_type, DocumentTypedFeature)
+            or "text" in target_type["type"]
+            or "text" in target
+            or utils.is_same_class(target_type, GISTypedFeature)
+            or "coord" in target_type["type"]
+            or "coord" in target
+        ):
+            raise ValueError(
+                f"The column {target} cannot be used as the target column."
+            )
+
+        # This is to catch constant columns that are boolean. Added as a fix for pd.isnull(), and datasets with a
+        #   binary target, but only data on one instance
+        if target_type and target_type["low_level_type"] == "bool":
+            return BinaryClassificationDataset(
+                positive_class=positive_class, **common
+            )
+
+        raise ValueError(
+            f"Unable to identify problem type. Specify the data type of {target} using 'types'. "
+            f"For example, types = {{{target}: 'category'}}"
+        )
+
     def rename_columns(self, columns):
         """
         Returns a dataset with columns renamed.
 
@@ -14,7 +14,7 @@ def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
         if isinstance(target, DateTimeTypedFeature):
             df = df.set_index(target)
         ADSDatasetWithTarget.__init__(
-            self, df, sampled_df, target, target_type, shape, **kwargs
+            self, df=df, sampled_df=sampled_df, target=target, target_type=target_type, shape=shape, **kwargs
         )
 
     def select_best_features(self, score_func=None, k=12):
 
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8; -*-
 
-# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 import ast
@@ -832,3 +832,20 @@ def _log_yscale_not_set():
     logger.info(
         "`yscale` parameter is not set. Valid values are `'linear'`, `'log'`, `'symlog'`."
     )
+
+def infer_target_type(target, target_series, discover_target_type=True):
+    """
+    Return the feature type of the target column.
+
+    Parameters
+    ----------
+    target:
+        Name of the target column.
+    target_series:
+        The values of the target column.
+    discover_target_type: bool
+        When true, run ``TypeDiscoveryDriver`` on the series with
+        ``is_target=True``; otherwise use ``get_feature_type``.
+    """
+    if not discover_target_type:
+        # Type discovery is turned off: infer the type from the pandas dtype.
+        return get_feature_type(target, target_series)
+    driver = TypeDiscoveryDriver()
+    return driver.discover(target, target_series, is_target=True)
+
+def get_target_type(target, sampled_df, **init_kwargs):
+    """
+    Determine the feature type of ``target`` from the sampled dataframe.
+
+    Reads two optional entries from ``init_kwargs``:
+
+    - ``type_discovery`` (default ``True``): whether type discovery is used.
+    - ``types``: mapping of column name to dtype. If it contains ``target``,
+      the target column of ``sampled_df`` is cast to that dtype **in place**
+      and type discovery is skipped for it.
+
+    Returns whatever ``infer_target_type`` returns for the target column.
+    """
+    discover_target_type = init_kwargs.get("type_discovery", True)
+    # ``types`` can be present but explicitly None; treat that the same as
+    # "no overrides" instead of failing on the membership test.
+    types = init_kwargs.get("types") or {}
+    if target in types:
+        sampled_df[target] = sampled_df[target].astype(types[target])
+        discover_target_type = False
+    return infer_target_type(target, sampled_df[target], discover_target_type)
@@ -10,5 +10,5 @@
 class RegressionDataset(ADSDatasetWithTarget):
     def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
         ADSDatasetWithTarget.__init__(
-            self, df, sampled_df, target, target_type, shape, **kwargs
+            self, df=df, sampled_df=sampled_df, target=target, target_type=target_type, shape=shape, **kwargs
         )
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):`
`14`	`14`	`if isinstance(target, DateTimeTypedFeature):`
`15`	`15`	`df = df.set_index(target)`
`16`	`16`	`ADSDatasetWithTarget.__init__(`
`17`		`- self, df, sampled_df, target, target_type, shape, **kwargs`
	`17`	`+ self, df=df, sampled_df=sampled_df, target=target, target_type=target_type, shape=shape, **kwargs`
`18`	`18`	`)`
`19`	`19`
`20`	`20`	`def select_best_features(self, score_func=None, k=12):`
Original file line number	Diff line number	Diff line change
`@@ -10,5 +10,5 @@`
`10`	`10`	`class RegressionDataset(ADSDatasetWithTarget):`
`11`	`11`	`def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):`
`12`	`12`	`ADSDatasetWithTarget.__init__(`
`13`		`- self, df, sampled_df, target, target_type, shape, **kwargs`
	`13`	`+ self, df=df, sampled_df=sampled_df, target=target, target_type=target_type, shape=shape, **kwargs`
`14`	`14`	`)`