@@ -321,7 +321,6 @@ def make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref):
                 and args["y"]
                 and len(trace_data[[args["x"], args["y"]]].dropna()) > 1
             ):
-
                 # sorting is bad but trace_specs with "trendline" have no other attrs
                 sorted_trace_data = trace_data.sort_values(by=args["x"])
                 y = sorted_trace_data[args["y"]].values
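For context on the guard shown in this hunk, here is a small standalone sketch (the literal column names "x" and "y" stand in for args["x"] and args["y"], and the data is made up): a trendline needs at least two complete (x, y) pairs, and the data is sorted by x so the fitted line is drawn left to right.

import pandas as pd

# hypothetical data frame playing the role of trace_data
trace_data = pd.DataFrame({"x": [3.0, None, 1.0, 2.0], "y": [9.0, 5.0, 1.0, 4.0]})
if len(trace_data[["x", "y"]].dropna()) > 1:  # at least two complete pairs to fit
    # "sorting is bad" cost-wise, but it keeps the fitted line left-to-right
    sorted_trace_data = trace_data.sort_values(by="x")
    y = sorted_trace_data["y"].values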
@@ -562,7 +561,6 @@ def set_cartesian_axis_opts(args, axis, letter, orders):
 
 
 def configure_cartesian_marginal_axes(args, fig, orders):
-
     if "histogram" in [args["marginal_x"], args["marginal_y"]]:
         fig.layout["barmode"] = "overlay"
 
@@ -885,8 +883,8 @@ def make_trace_spec(args, constructor, attrs, trace_patch):
 def make_trendline_spec(args, constructor):
     trace_spec = TraceSpec(
         constructor=go.Scattergl
-        if constructor == go.Scattergl  # could be contour
-        else go.Scatter,
+        if constructor == go.Scattergl
+        else go.Scatter,  # could be contour
         attrs=["trendline"],
         trace_patch=dict(mode="lines"),
         marginal=None,
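A minimal sketch of the fallback that the relocated comment documents, assuming only that the base constructor may be a non-scatter trace type (for example a contour constructor): the trendline overlay stays WebGL only when the base trace is already go.Scattergl, and otherwise falls back to a plain go.Scatter.

import plotly.graph_objects as go

# the base constructor could be a contour-style trace; the trendline itself
# is always drawn as a (gl or non-gl) scatter
for base in (go.Scattergl, go.Histogram2dContour):
    trendline_constructor = go.Scattergl if base == go.Scattergl else go.Scatter
    print(base.__name__, "->", trendline_constructor.__name__)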
@@ -1064,14 +1062,25 @@ def _escape_col_name(df_input, col_name, extra):
     return col_name
 
 
-def to_unindexed_series(x):
+def to_unindexed_series(x, name=None):
     """
-    assuming x is list-like or even an existing pd.Series, return a new pd.Series with
-    no index, without extracting the data from an existing Series via numpy, which
+    assuming x is list-like or even an existing pd.Series, return a new pd.DataFrame
+    with no index, without extracting the data from an existing Series via numpy, which
     seems to mangle datetime columns. Stripping the index from existing pd.Series is
-    required to get things to match up right in the new DataFrame we're building
+    required to get things to match up right in the new DataFrame we're building.
+    It's converted to a frame so that it can be concatenated easily and it exposes a
+    `columns` attribute, so `_get_cols` can be used on it.
     """
-    return pd.Series(x).reset_index(drop=True)
+    return pd.Series(x, name=name).reset_index(drop=True).to_frame()
+
+
+def _get_cols(df_list):
+    """
+    get all the columns in the current df_list.
+    Since this is only called when we raise an error, it runs at most once,
+    so the inefficiency here can be tolerated.
+    """
+    return [column for df in df_list for column in df.columns]
 
 
 def process_args_into_dataframe(args, wide_mode, var_name, value_name):
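An illustrative sketch, separate from the diff, of what the reworked helper returns (the names "y" and "x" and the values are invented): the input keeps its values but loses its original index, the optional name becomes the column label, and the resulting one-column frame exposes .columns, which is what lets a helper like _get_cols list names across several collected pieces.

import pandas as pd

s = pd.Series([10, 20, 30], index=[7, 8, 9])  # an index that should not leak through
piece = pd.Series(s, name="y").reset_index(drop=True).to_frame()
print(piece.columns.tolist())  # ['y']
print(piece.index.tolist())    # [0, 1, 2], a fresh RangeIndex ready for concat

# column names across a list of such frames, mirroring what _get_cols does
pieces = [piece, pd.Series([1, 2, 3], name="x").reset_index(drop=True).to_frame()]
print([column for df in pieces for column in df.columns])  # ['y', 'x']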
@@ -1086,9 +1095,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
     df_input = args["data_frame"]
     df_provided = df_input is not None
 
-    df_output = pd.DataFrame()
-    constants = dict()
-    ranges = list()
+    # we append the output columns to a list and concatenate them once at the
+    # end, to avoid performance issues in pandas when dealing with large dataframes
+    df_outputs = []
+    constants = {}
+    ranges = []
     wide_id_vars = set()
     reserved_names = _get_reserved_col_names(args) if df_provided else set()
 
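The comment introduced in this hunk is the heart of the change; below is a hedged sketch of the pattern it describes, under the assumption that every collected piece has the same length. Assigning new columns to a growing DataFrame one at a time can trigger repeated copies (and pandas fragmentation warnings) on wide inputs, whereas collecting one-column frames in a plain Python list and concatenating once does the allocation in a single step.

import pandas as pd

columns = {"x": [1, 2, 3], "y": [4.0, 5.0, 6.0], "color": ["a", "b", "c"]}

# old style: insert columns one by one into a growing frame
df_slow = pd.DataFrame()
for name, values in columns.items():
    df_slow[name] = pd.Series(values).reset_index(drop=True)

# new style: collect single-column frames in a list, concatenate once
pieces = [
    pd.Series(values, name=name).reset_index(drop=True).to_frame()
    for name, values in columns.items()
]
df_fast = pd.concat(pieces, axis=1)

assert df_fast.equals(df_slow)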
@@ -1099,7 +1110,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
                 "No data were provided. Please provide data either with the `data_frame` or with the `dimensions` argument."
             )
         else:
-            df_output[df_input.columns] = df_input[df_input.columns]
+            df_outputs.append(df_input[df_input.columns])
 
     # hover_data is a dict
     hover_data_is_dict = (
@@ -1140,7 +1151,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
     # argument_list and field_list ready, iterate over them
     # Core of the loop starts here
     for i, (argument, field) in enumerate(zip(argument_list, field_list)):
-        length = len(df_output)
+        length = len(df_outputs[0]) if len(df_outputs) else 0
         if argument is None:
            continue
         col_name = None
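A short sketch of the length bookkeeping after the switch to a list, with made-up values: the expected row count now comes from the first collected piece and defaults to 0 while nothing has been collected yet, so later arguments can still be validated against it.

import pandas as pd

df_outputs = []
length = len(df_outputs[0]) if len(df_outputs) else 0  # 0: nothing collected yet
df_outputs.append(pd.DataFrame({"x": [1, 2, 3]}))
length = len(df_outputs[0]) if len(df_outputs) else 0  # 3, used to check later arguments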
@@ -1181,11 +1192,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
                         % (
                             argument,
                             len(real_argument),
-                            str(list(df_output.columns)),
+                            str(_get_cols(df_outputs)),
                             length,
                         )
                     )
-                df_output[col_name] = to_unindexed_series(real_argument)
+                df_outputs.append(to_unindexed_series(real_argument, col_name))
             elif not df_provided:
                 raise ValueError(
                     "String or int arguments are only possible when a "
@@ -1214,13 +1225,13 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
                     % (
                         field,
                         len(df_input[argument]),
-                        str(list(df_output.columns)),
+                        str(_get_cols(df_outputs)),
                         length,
                     )
                 )
             else:
                 col_name = str(argument)
-                df_output[col_name] = to_unindexed_series(df_input[argument])
+                df_outputs.append(to_unindexed_series(df_input[argument], col_name))
         # ----------------- argument is likely a column / array / list.... -------
         else:
             if df_provided and hasattr(argument, "name"):
@@ -1247,9 +1258,9 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
                     "All arguments should have the same length. "
                     "The length of argument `%s` is %d, whereas the "
                     "length of previously-processed arguments %s is %d"
-                    % (field, len(argument), str(list(df_output.columns)), length)
+                    % (field, len(argument), str(_get_cols(df_outputs)), length)
                 )
-            df_output[str(col_name)] = to_unindexed_series(argument)
+            df_outputs.append(to_unindexed_series(argument, str(col_name)))
 
         # Finally, update argument with column name now that column exists
         assert col_name is not None, (
@@ -1267,12 +1278,14 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
         if field_name != "wide_variable":
             wide_id_vars.add(str(col_name))
 
-    for col_name in ranges:
-        df_output[col_name] = range(len(df_output))
+    length = len(df_outputs[0])
+    df_outputs.extend([pd.Series(range(length), name=col_name) for col_name in ranges])
 
-    for col_name in constants:
-        df_output[col_name] = constants[col_name]
+    df_outputs.extend(
+        [pd.Series(constants[col_name], name=col_name) for col_name in constants]
+    )
 
+    df_output = pd.concat(df_outputs, axis=1)
     return df_output, wide_id_vars
 
 
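To close the loop, a sketch of the final assembly step under the same all-lengths-match assumption (the names "y", "index" and "label" and the constant value are invented): Range placeholders become integer Series of the common length, constants become Series as well, and a single pd.concat(..., axis=1) builds the df_output that the function returns. In this sketch the constant is repeated to the common length so that the axis=1 concatenation fills every row rather than only row 0.

import pandas as pd

df_outputs = [pd.Series([10, 20, 30], name="y").reset_index(drop=True).to_frame()]
ranges = ["index"]
constants = {"label": "all points"}

length = len(df_outputs[0])
df_outputs.extend([pd.Series(range(length), name=col_name) for col_name in ranges])
df_outputs.extend(
    # repeating the scalar keeps every row filled after the axis=1 concat
    [pd.Series([constants[col_name]] * length, name=col_name) for col_name in constants]
)
df_output = pd.concat(df_outputs, axis=1)  # columns: y, index, label; 3 rows, no NaN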