Merged

26 commits
4701577
move a lot of files and functions; remove concept of explicit js type;…
Nov 7, 2025
7a49707
move a lot of files and functions; remove concept of explicit js type;…
Nov 7, 2025
2a6e17f
move more json files; add a mini version of Metadata class
Nov 7, 2025
336b781
looks like get component requirement is not needed
Nov 7, 2025
810d95d
remove df_utils, import load_df from schema_generation
Nov 7, 2025
589db6b
make sure if setup.cfg changes, the dependencies get reinstalled
Nov 7, 2025
5bff42b
reorganize test and test dirs in the code
Nov 7, 2025
0e216f9
update comments
Nov 7, 2025
67e8728
add back example model jsonld
Nov 7, 2025
41c5184
rewrite some tests using dmge_column_type
Nov 10, 2025
8a26146
remove unused comment
Nov 10, 2025
71ae547
trim down the helper function
Nov 10, 2025
4a73fa5
update the reason
Nov 10, 2025
1391983
remove js_type in _get_validation_rule_based_fields
Nov 10, 2025
e96e87f
remove js type in ValidationRule
Nov 10, 2025
699e543
completely remove get_js_type_from_inputted_rules
Nov 10, 2025
09d3c7f
replace node2 with traversal node
Nov 10, 2025
4cf904f
completely remove node2
Nov 10, 2025
626b06e
remove type validation rule name
Nov 10, 2025
c8758f7
abolish jsonschema type
Nov 10, 2025
b6a8340
remove comment
Nov 10, 2025
fe48b08
revert changes to the json schema type
Nov 10, 2025
1872909
updated dmge to use get_validation_rule_based_fields_no_explicit_type
Nov 10, 2025
1d6abfc
delete temporary files generated by unit test
Nov 10, 2025
abb7540
remove unused import
Nov 11, 2025
3ef114f
simply just use the key to v29 like other PRs
Nov 11, 2025
2 changes: 2 additions & 0 deletions setup.cfg
@@ -112,6 +112,8 @@ curator =
networkx>=2.2.8
dataclasses-json>=0.6.1
rdflib>=6.0.0
jsonschema>=4.23.0
Contributor Author:

This library is also required for the curator extension. We did the following in the test:

from jsonschema import Draft7Validator
from jsonschema.exceptions import ValidationError
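
For reference, a minimal sketch of the validation pattern (the schema and record below are illustrative, not taken from the test suite):

validator = Draft7Validator({"type": "object", "properties": {"age": {"type": "integer"}}})
try:
    validator.validate({"age": "not a number"})
except ValidationError as err:
    print(err.message)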



pysftp =
pysftp>=0.2.8,<0.3
203 changes: 203 additions & 0 deletions synapseclient/extensions/curator/df_utils.py
@@ -0,0 +1,203 @@
import logging
from copy import deepcopy
from time import perf_counter
from typing import Any, Optional, Union

import numpy as np
import pandas as pd
from pandarallel import pandarallel # type: ignore

# pylint:disable=no-name-in-module
from pandas._libs.parsers import STR_NA_VALUES # type: ignore

STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES)

try:
STR_NA_VALUES_FILTERED.remove("None")
except KeyError:
pass

logger = logging.getLogger(__name__)


def read_csv(
path_or_buffer: str,
keep_default_na: bool = False,
encoding: str = "utf8",
**load_args: Any,
) -> pd.DataFrame:
"""
A wrapper around pd.read_csv that filters out "None" from the na_values list.

Args:
path_or_buffer: The path to the file or a buffer containing the file.
keep_default_na: Whether to keep the default na_values list.
encoding: The encoding of the file.
**load_args: Additional arguments to pass to pd.read_csv.

Returns:
pd.DataFrame: The dataframe created from the CSV file or buffer.
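
    Example (illustrative; assumes a local file named "manifest.csv" exists):
        df = read_csv("manifest.csv")
        # a cell containing the literal string "None" is kept as a string
        # rather than being coerced to NaN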
"""
na_values = load_args.pop(
"na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None
)
return pd.read_csv( # type: ignore
path_or_buffer,
na_values=na_values,
keep_default_na=keep_default_na,
encoding=encoding,
**load_args,
)


def trim_commas_df(
dataframe: pd.DataFrame,
allow_na_values: Optional[bool] = False,
) -> pd.DataFrame:
"""Removes empty (trailing) columns and empty rows from pandas dataframe (manifest data).

Args:
dataframe: pandas dataframe with data from manifest file.
allow_na_values (bool, optional): If true, allow pd.NA values in the dataframe

Returns:
df: cleaned-up pandas dataframe.
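
    Example (illustrative):
        raw = pd.DataFrame({"Component": ["a", None], "Unnamed: 2": [None, None]})
        trim_commas_df(raw)
        # drops the "Unnamed: 2" column, drops the all-empty second row, and
        # fills remaining NaN cells with "" (allow_na_values defaults to False)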
"""
# remove all columns which have substring "Unnamed" in them
dataframe = dataframe.loc[:, ~dataframe.columns.str.contains("^Unnamed")]

# remove all completely empty rows
dataframe = dataframe.dropna(how="all", axis=0)

if allow_na_values is False:
# Fill in nan cells with empty strings
dataframe.fillna("", inplace=True)
return dataframe


def convert_ints(string: str) -> Union[np.int64, bool]:
"""
    Convert a string to a numpy int64 if possible; otherwise return False
Args:
string: string to attempt conversion to int
Returns:
string converted to type int if possible, otherwise False
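
    Example (illustrative):
        convert_ints("42")   # -> np.int64(42)
        convert_ints("4.2")  # -> False (the "." fails str.isdigit)
        convert_ints("-1")   # -> False (the sign character fails str.isdigit)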
"""
if isinstance(string, str) and str.isdigit(string):
return np.int64(string)
return False


def find_and_convert_ints(dataframe: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Find strings that represent integers and convert to type int
Args:
dataframe: dataframe with nulls masked as empty strings
Returns:
ints: dataframe with values that were converted to type int
is_int: dataframe with boolean values indicating which cells were converted to type int
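
    Example (illustrative):
        df = pd.DataFrame({"id": ["1", "x"]})
        ints, is_int = find_and_convert_ints(df)
        # ints["id"] -> [1, False]; is_int["id"] -> [True, False]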

"""
# pylint: disable=unnecessary-lambda
large_manifest_cutoff_size = 1000
# Find integers stored as strings and replace with entries of type np.int64
if (
dataframe.size < large_manifest_cutoff_size
    ):  # For small manifests, map serially; pandarallel's startup overhead outweighs its benefit
ints = dataframe.map( # type:ignore
lambda cell: convert_ints(cell), na_action="ignore"
).fillna(False)

else: # parallelize iterations for large manifests
pandarallel.initialize(verbose=1)
ints = dataframe.parallel_applymap( # type:ignore
lambda cell: convert_ints(cell), na_action="ignore"
).fillna(False)

# Identify cells converted to integers
is_int = ints.map(pd.api.types.is_integer) # type:ignore

assert isinstance(ints, pd.DataFrame)
assert isinstance(is_int, pd.DataFrame)

return ints, is_int


def convert_floats(dataframe: pd.DataFrame) -> pd.DataFrame:
"""
Convert strings that represent floats to type float
Args:
dataframe: dataframe with nulls masked as empty strings
Returns:
float_df: dataframe with values that were converted to type float. Columns are type object
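
    Example (illustrative):
        df = pd.DataFrame({"val": ["1.5", "abc"]})
        convert_floats(df)
        # "1.5" becomes the float 1.5; "abc" is preserved as the original string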
"""
# create a separate copy of the manifest
# before beginning conversions to store float values
float_df = deepcopy(dataframe)

# convert strings to numerical dtype (float) if possible, preserve non-numerical strings
for col in dataframe.columns:
float_df[col] = pd.to_numeric(float_df[col], errors="coerce").astype("object")

        # replace values that couldn't be converted to float with the original
        # string values; assign back rather than using chained inplace fillna,
        # which is unreliable under pandas copy-on-write
        float_df[col] = float_df[col].fillna(dataframe[col][float_df[col].isna()])

return float_df


def load_df(
file_path: str,
preserve_raw_input: bool = True,
data_model: bool = False,
allow_na_values: bool = False,
**load_args: Any,
) -> pd.DataFrame:
"""
Universal function to load CSVs and return DataFrames
    Parses string entries, converting them to type int or float where appropriate.
    Pandarallel is used for type inference on large manifests to improve performance.

Args:
file_path (str): path of csv to open
preserve_raw_input (bool, optional): If false, convert cell datatypes to an inferred type
data_model (bool, optional): bool, indicates if importing a data model
allow_na_values (bool, optional): If true, allow pd.NA values in the dataframe
**load_args(dict): dict of key value pairs to be passed to the pd.read_csv function

Raises:
ValueError: When pd.read_csv on the file path doesn't return as dataframe

Returns:
        pd.DataFrame: a processed dataframe for manifests, or an unprocessed one
            for data models or when preserve_raw_input is True
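
    Example (illustrative; assumes a local file named "manifest.csv" exists):
        df = load_df("manifest.csv", preserve_raw_input=False)
        # numeric-looking strings come back as int64/float values; other
        # strings are returned unchanged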
"""
# start performance timer
t_load_df = perf_counter()

# Read CSV to df as type specified in kwargs
org_df = read_csv(file_path, encoding="utf8", **load_args) # type: ignore
if not isinstance(org_df, pd.DataFrame):
raise ValueError(
(
"Pandas did not return a dataframe. "
"Pandas will return a TextFileReader if chunksize parameter is used."
)
)

# only trim if not data model csv
if not data_model:
org_df = trim_commas_df(org_df, allow_na_values=allow_na_values)

if preserve_raw_input:
logger.debug(f"Load Elapsed time {perf_counter()-t_load_df}")
return org_df

ints, is_int = find_and_convert_ints(org_df)

float_df = convert_floats(org_df)

# Store values that were converted to type int in the final dataframe
processed_df = float_df.mask(is_int, other=ints)

logger.debug(f"Load Elapsed time {perf_counter()-t_load_df}")
return processed_df