From f6bdf935a1da68353176f65654242691212b0c04 Mon Sep 17 00:00:00 2001 From: Fidorc80 <114183964+Fidorc80@users.noreply.github.com> Date: Mon, 24 Nov 2025 08:41:01 -0800 Subject: [PATCH 1/4] DOC: inline docstrings for read_excel and storage options in io/excel/base.py. --- pandas/io/excel/_base.py | 602 +++++++++++++++++++++------------------ 1 file changed, 321 insertions(+), 281 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a171b1229f7bb..ab17865fe85e5 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -36,11 +36,7 @@ import_optional_dependency, ) from pandas.errors import EmptyDataError -from pandas.util._decorators import ( - Appender, - doc, - set_module, -) +from pandas.util._decorators import (set_module) from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -88,274 +84,6 @@ StorageOptions, WriteExcelBuffer, ) -_read_excel_doc = ( - """ -Read an Excel file into a ``DataFrame``. - -Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions -read from a local filesystem or URL. Supports an option to read -a single sheet or a list of sheets. - -Parameters ----------- -io : str, ExcelFile, xlrd.Book, path object, or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. A local file could be: ``file://localhost/path/to/table.xlsx``. - - If you want to pass in a path object, pandas accepts any ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) - or ``StringIO``. - - .. deprecated:: 2.1.0 - Passing byte strings is deprecated. To read from a - byte string, wrap it in a ``BytesIO`` object. -sheet_name : str, int, list, or None, default 0 - Strings are used for sheet names. 
Integers are used in zero-indexed - sheet positions (chart sheets do not count as a sheet position). - Lists of strings/integers are used to request multiple sheets. - When ``None``, will return a dictionary containing DataFrames for each sheet. - - Available cases: - - * Defaults to ``0``: 1st sheet as a `DataFrame` - * ``1``: 2nd sheet as a `DataFrame` - * ``"Sheet1"``: Load sheet with name "Sheet1" - * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" - as a dict of `DataFrame` - * ``None``: Returns a dictionary containing DataFrames for each sheet.. - -header : int, list of int, default 0 - Row (0-indexed) to use for the column labels of the parsed - DataFrame. If a list of integers is passed those row positions will - be combined into a ``MultiIndex``. Use None if there is no header. -names : array-like, default None - List of column names to use. If file contains no header row, - then you should explicitly pass header=None. -index_col : int, str, list of int, default None - Column (0-indexed) to use as the row labels of the DataFrame. - Pass None if there is no such column. If a list is passed, - those columns will be combined into a ``MultiIndex``. If a - subset of data is selected with ``usecols``, index_col - is based on the subset. - - Missing values will be forward filled to allow roundtripping with - ``to_excel`` for ``merged_cells=True``. To avoid forward filling the - missing values use ``set_index`` after reading the data instead of - ``index_col``. -usecols : str, list-like, or callable, default None - * If None, then parse all columns. - * If str, then indicates comma separated list of Excel column letters - and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of - both sides. - * If list of int, then indicates list of column numbers to be parsed - (0-indexed). - * If list of string, then indicates list of column names to be parsed. 
- * If callable, then evaluate each column name against it and parse the - column if the callable returns ``True``. - - Returns a subset of the columns according to behavior above. -dtype : Type name or dict of column -> type, default None - Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} - Use ``object`` to preserve data as stored in Excel and not interpret dtype, - which will necessarily result in ``object`` dtype. - If converters are specified, they will be applied INSTEAD - of dtype conversion. - If you use ``None``, it will infer the dtype of each column based on the data. -engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None - If io is not a buffer or path, this must be set to identify io. - Engine compatibility : - - - ``openpyxl`` supports newer Excel file formats. - - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) - and OpenDocument (.ods) file formats. - - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - - ``pyxlsb`` supports Binary Excel files. - - ``xlrd`` supports old-style Excel files (.xls). - - When ``engine=None``, the following logic will be used to determine the engine: - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. - - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. - - Otherwise ``openpyxl`` will be used. -converters : dict, default None - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the Excel cell content, and return the transformed - content. -true_values : list, default None - Values to consider as True. -false_values : list, default None - Values to consider as False. 
-skiprows : list-like, int, or callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (int) at the - start of the file. If callable, the callable function will be evaluated - against the row indices, returning True if the row should be skipped and - False otherwise. An example of a valid callable argument would be ``lambda - x: x in [0, 2]``. -nrows : int, default None - Number of rows to parse. Does not include header rows. -na_values : scalar, str, list-like, or dict, default None - Additional strings to recognize as NA/NaN. If dict passed, specific - per-column NA values. By default the following values are interpreted - as NaN: '""" - + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") - + """'. -keep_default_na : bool, default True - Whether or not to include the default NaN values when parsing the data. - Depending on whether ``na_values`` is passed in, the behavior is as follows: - - * If ``keep_default_na`` is True, and ``na_values`` are specified, - ``na_values`` is appended to the default NaN values used for parsing. - * If ``keep_default_na`` is True, and ``na_values`` are not specified, only - the default NaN values are used for parsing. - * If ``keep_default_na`` is False, and ``na_values`` are specified, only - the NaN values specified ``na_values`` are used for parsing. - * If ``keep_default_na`` is False, and ``na_values`` are not specified, no - strings will be parsed as NaN. - - Note that if `na_filter` is passed in as False, the ``keep_default_na`` and - ``na_values`` parameters will be ignored. -na_filter : bool, default True - Detect missing value markers (empty strings and the value of na_values). In - data without any NAs, passing ``na_filter=False`` can improve the - performance of reading a large file. -verbose : bool, default False - Indicate number of NA values placed in non-numeric columns. -parse_dates : bool, list-like, or dict, default False - The behavior is as follows: - - * ``bool``. 
If True -> try parsing the index. - * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 - each as a separate date column. - * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as - a single date column. - * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call - result 'foo' - - If a column or index contains an unparsable date, the entire column or - index will be returned unaltered as an object data type. If you don`t want to - parse some cells as date just change their type in Excel to "Text". - For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. - - Note: A fast-path exists for iso8601-formatted dates. -date_format : str or dict of column -> format, default ``None`` - If used in conjunction with ``parse_dates``, will parse dates according to this - format. For anything more complex, - please read in as ``object`` and then apply :func:`to_datetime` as-needed. - - .. versionadded:: 2.0.0 -thousands : str, default None - Thousands separator for parsing string columns to numeric. Note that - this parameter is only necessary for columns stored as TEXT in Excel, - any numeric columns will automatically be parsed, regardless of display - format. -decimal : str, default '.' - Character to recognize as decimal point for parsing string columns to numeric. - Note that this parameter is only necessary for columns stored as TEXT in Excel, - any numeric columns will automatically be parsed, regardless of display - format.(e.g. use ',' for European data). -comment : str, default None - Comments out remainder of line. Pass a character or characters to this - argument to indicate comments in the input file. Any data between the - comment string and the end of the current line is ignored. -skipfooter : int, default 0 - Rows at the end to skip (0-indexed). 
-{storage_options} - -dtype_backend : {{'numpy_nullable', 'pyarrow'}} - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). If not specified, the default behavior - is to not use nullable data types. If specified, the behavior - is as follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - * ``"pyarrow"``: returns pyarrow-backed nullable - :class:`ArrowDtype` :class:`DataFrame` - - .. versionadded:: 2.0 - -engine_kwargs : dict, optional - Arbitrary keyword arguments passed to excel engine. - -Returns -------- -DataFrame or dict of DataFrames - DataFrame from the passed in Excel file. See notes in sheet_name - argument for more information on when a dict of DataFrames is returned. - -See Also --------- -DataFrame.to_excel : Write DataFrame to an Excel file. -DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. -read_csv : Read a comma-separated values (csv) file into DataFrame. -read_fwf : Read a table of fixed-width formatted lines into DataFrame. - -Notes ------ -For specific information on the methods used for each Excel engine, refer to the pandas -:ref:`user guide ` - -Examples --------- -The file can be read using the file name as string or an open file object: - ->>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP - Name Value -0 string1 1 -1 string2 2 -2 #Comment 3 - ->>> pd.read_excel(open('tmp.xlsx', 'rb'), -... sheet_name='Sheet3') # doctest: +SKIP - Unnamed: 0 Name Value -0 0 string1 1 -1 1 string2 2 -2 2 #Comment 3 - -Index and header can be specified via the `index_col` and `header` arguments - ->>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP - 0 1 2 -0 NaN Name Value -1 0.0 string1 1 -2 1.0 string2 2 -3 2.0 #Comment 3 - -Column types are inferred but can be explicitly specified - ->>> pd.read_excel('tmp.xlsx', index_col=0, -... 
dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP - Name Value -0 string1 1.0 -1 string2 2.0 -2 #Comment 3.0 - -True, False, and NA values, and thousands separators have defaults, -but can be explicitly specified, too. Supply the values you would like -as strings or lists of strings! - ->>> pd.read_excel('tmp.xlsx', index_col=0, -... na_values=['string1', 'string2']) # doctest: +SKIP - Name Value -0 NaN 1 -1 NaN 2 -2 #Comment 3 - -Comment lines in the excel input file can be skipped using the -``comment`` kwarg. - ->>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP - Name Value -0 string1 1.0 -1 string2 2.0 -2 None NaN -""" -) @overload @@ -433,8 +161,6 @@ def read_excel( @set_module("pandas") -@doc(storage_options=_shared_docs["storage_options"]) -@Appender(_read_excel_doc) def read_excel( io, sheet_name: str | int | list[IntStrT] | None = 0, @@ -469,6 +195,299 @@ def read_excel( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, engine_kwargs: dict | None = None, ) -> DataFrame | dict[IntStrT, DataFrame]: + """ + Read an Excel file into a ``pandas`` ``DataFrame``. + + Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions + read from a local filesystem or URL. Supports an option to read + a single sheet or a list of sheets. + + Parameters + ---------- + io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.xlsx``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handle (e.g. via builtin ``open`` function) + or ``StringIO``. + + .. deprecated:: 2.1.0 + Passing byte strings is deprecated. To read from a + byte string, wrap it in a ``BytesIO`` object. 
+ sheet_name : str, int, list, or None, default 0 + Strings are used for sheet names. Integers are used in zero-indexed + sheet positions (chart sheets do not count as a sheet position). + Lists of strings/integers are used to request multiple sheets. + Specify ``None`` to get all worksheets. + + Available cases: + + * Defaults to ``0``: 1st sheet as a `DataFrame` + * ``1``: 2nd sheet as a `DataFrame` + * ``"Sheet1"``: Load sheet with name "Sheet1" + * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" + as a dict of `DataFrame` + * ``None``: All worksheets. + + header : int, list of int, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex``. Use None if there is no header. + names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None. + index_col : int, str, list of int, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex``. If a + subset of data is selected with ``usecols``, index_col + is based on the subset. + + Missing values will be forward filled to allow roundtripping with + ``to_excel`` for ``merged_cells=True``. To avoid forward filling the + missing values use ``set_index`` after reading the data instead of + ``index_col``. + usecols : str, list-like, or callable, default None + * If None, then parse all columns. + * If str, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. + * If list of int, then indicates list of column numbers to be parsed + (0-indexed). + * If list of string, then indicates list of column names to be parsed. 
+ * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + Returns a subset of the columns according to behavior above. + dtype : Type name or dict of column -> type, default None + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} + Use ``object`` to preserve data as stored in Excel and not interpret dtype, + which will necessarily result in ``object`` dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + If you use ``None``, it will infer the dtype of each column based on the data. + engine : {'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}, default None + If io is not a buffer or path, this must be set to identify io. + Engine compatibility : + + - ``openpyxl`` supports newer Excel file formats. + - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. + - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). + - ``pyxlsb`` supports Binary Excel files. + - ``xlrd`` supports old-style Excel files (.xls). + + When ``engine=None``, the following logic will be used to determine the engine: + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. + true_values : list, default None + Values to consider as True. + false_values : list, default None + Values to consider as False. 
+ skiprows : list-like, int, or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) at the + start of the file. If callable, the callable function will be evaluated + against the row indices, returning True if the row should be skipped and + False otherwise. An example of a valid callable argument would be ``lambda + x: x in [0, 2]``. + nrows : int, default None + Number of rows to parse. + na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted + as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', + '1.#IND', '1.#QNAN', '', 'N/A', 'NA', 'NULL', 'NaN', 'None', + 'n/a', 'nan', 'null'. + keep_default_na : bool, default True + Whether or not to include the default NaN values when parsing the data. + Depending on whether ``na_values`` is passed in, the behavior is as follows: + + * If ``keep_default_na`` is True, and ``na_values`` are specified, + ``na_values`` is appended to the default NaN values used for parsing. + * If ``keep_default_na`` is True, and ``na_values`` are not specified, only + the default NaN values are used for parsing. + * If ``keep_default_na`` is False, and ``na_values`` are specified, only + the NaN values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is False, and ``na_values`` are not specified, no + strings will be parsed as NaN. + + Note that if `na_filter` is passed in as False, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. + na_filter : bool, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing ``na_filter=False`` can improve the + performance of reading a large file. + verbose : bool, default False + Indicate number of NA values placed in non-numeric columns. 
+    parse_dates : bool, list-like, or dict, default False
+        The behavior is as follows:
+
+        * ``bool``. If True -> try parsing the index.
+        * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
+          each as a separate date column.
+        * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
+          a single date column.
+        * ``dict``, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
+          result 'foo'
+
+        If a column or index contains an unparsable date, the entire column or
+        index will be returned unaltered as an object data type. If you don't want to
+        parse some cells as date just change their type in Excel to "Text".
+        For non-standard datetime parsing, use ``pd.to_datetime`` after
+        ``pd.read_excel``.
+
+        Note: A fast-path exists for iso8601-formatted dates.
+    date_parser : function, optional
+        Function to use for converting a sequence of string columns to an array of
+        datetime instances. The default uses ``dateutil.parser.parser`` to do the
+        conversion. Pandas will try to call `date_parser` in three different ways,
+        advancing to the next if an exception occurs: 1) Pass one or more arrays
+        (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
+        string values from the columns defined by `parse_dates` into a single array
+        and pass that; and 3) call `date_parser` once for each row using one or
+        more strings (corresponding to the columns defined by `parse_dates`) as
+        arguments.
+
+        .. deprecated:: 2.0.0
+            Use ``date_format`` instead, or read in as ``object`` and then apply
+            :func:`to_datetime` as-needed.
+    date_format : str or dict of column -> format, default ``None``
+        If used in conjunction with ``parse_dates``, will parse dates according to this
+        format. For anything more complex,
+        please read in as ``object`` and then apply :func:`to_datetime` as-needed.
+
+        .. versionadded:: 2.0.0
+    thousands : str, default None
+        Thousands separator for parsing string columns to numeric. 
Note that
+        this parameter is only necessary for columns stored as TEXT in Excel,
+        any numeric columns will automatically be parsed, regardless of display
+        format.
+    decimal : str, default '.'
+        Character to recognize as decimal point for parsing string columns to numeric.
+        Note that this parameter is only necessary for columns stored as TEXT in Excel,
+        any numeric columns will automatically be parsed, regardless of display
+        format (e.g. use ',' for European data).
+
+        .. versionadded:: 1.4.0
+
+    comment : str, default None
+        Comments out remainder of line. Pass a character or characters to this
+        argument to indicate comments in the input file. Any data between the
+        comment string and the end of the current line is ignored.
+    skipfooter : int, default 0
+        Rows at the end to skip (0-indexed).
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+    dtype_backend : {'numpy_nullable', 'pyarrow'}, optional
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior is to
+        not use nullable data types. If specified, the behavior is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
+          DataFrame.
+
+        .. versionadded:: 2.0
+
+    engine_kwargs : dict, optional
+        Arbitrary keyword arguments passed to excel engine.
+
+    Returns
+    -------
+    DataFrame or dict of DataFrames
+        DataFrame from the passed in Excel file. See notes in sheet_name
+        argument for more information on when a dict of DataFrames is returned. 
+ + See Also + -------- + DataFrame.to_excel : Write DataFrame to an Excel file. + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_fwf : Read a table of fixed-width formatted lines into DataFrame. + + Notes + ----- + For specific information on the methods used for each Excel engine, refer to the + pandas + :ref:`user guide ` + + Examples + -------- + The file can be read using the file name as string or an open file object: + + >>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP + Name Value + 0 string1 1 + 1 string2 2 + 2 #Comment 3 + + >>> pd.read_excel(open('tmp.xlsx', 'rb'), + ... sheet_name='Sheet3') # doctest: +SKIP + Unnamed: 0 Name Value + 0 0 string1 1 + 1 1 string2 2 + 2 2 #Comment 3 + + Index and header can be specified via the `index_col` and `header` arguments + + >>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP + 0 1 2 + 0 NaN Name Value + 1 0.0 string1 1 + 2 1.0 string2 2 + 3 2.0 #Comment 3 + + Column types are inferred but can be explicitly specified + + >>> pd.read_excel('tmp.xlsx', index_col=0, + ... dtype={'Name': str, 'Value': float}) # doctest: +SKIP + Name Value + 0 string1 1.0 + 1 string2 2.0 + 2 #Comment 3.0 + + True, False, and NA values, and thousands separators have defaults, + but can be explicitly specified, too. Supply the values you would like + as strings or lists of strings! + + >>> pd.read_excel('tmp.xlsx', index_col=0, + ... na_values=['string1', 'string2']) # doctest: +SKIP + Name Value + 0 NaN 1 + 1 NaN 2 + 2 #Comment 3 + + Comment lines in the excel input file can be skipped using the + ``comment`` kwarg. 
+ + >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP + Name Value + 0 string1 1.0 + 1 string2 2.0 + 2 None NaN + """ + check_dtype_backend(dtype_backend) should_close = False if engine_kwargs is None: @@ -951,7 +970,6 @@ def _parse_sheet( @set_module("pandas") -@doc(storage_options=_shared_docs["storage_options"]) class ExcelWriter(Generic[_WorkbookT]): """ Class for writing DataFrame objects into excel sheets. @@ -982,7 +1000,15 @@ class ExcelWriter(Generic[_WorkbookT]): (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {{'w', 'a'}}, default 'w' File mode to use (write or append). Append does not work with fsspec URLs. - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error' How to behave when trying to write to a sheet that already @@ -1405,7 +1431,6 @@ def close(self) -> None: PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,))) -@doc(storage_options=_shared_docs["storage_options"]) def inspect_excel_format( content_or_path: FilePath | ReadBuffer[bytes], storage_options: StorageOptions | None = None, @@ -1419,7 +1444,15 @@ def inspect_excel_format( ---------- content_or_path : str or file-like object Path to file or content of file to inspect. May be a URL. - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. 
For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. Returns ------- @@ -1467,7 +1500,6 @@ def inspect_excel_format( @set_module("pandas") -@doc(storage_options=_shared_docs["storage_options"]) class ExcelFile: """ Class for parsing tabular Excel sheets into DataFrame objects. @@ -1511,7 +1543,15 @@ class ExcelFile: Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. This is not supported, switch to using ``openpyxl`` instead. - {storage_options} + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. engine_kwargs : dict, optional Arbitrary keyword arguments passed to excel engine. From 5ddcd0696a943b22e36dce1d6474021d66faec58 Mon Sep 17 00:00:00 2001 From: Fidorc80 <114183964+Fidorc80@users.noreply.github.com> Date: Mon, 24 Nov 2025 09:34:45 -0800 Subject: [PATCH 2/4] DOC: inline docstrings for read_excel and storage options in io/excel/base.py. 
with Ruff formatting fixes --- pandas/io/excel/_base.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ab17865fe85e5..a047f2d5f73f6 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -11,7 +11,6 @@ from decimal import Decimal from functools import partial import os -from textwrap import fill from typing import ( IO, TYPE_CHECKING, @@ -30,13 +29,12 @@ from pandas._config import config from pandas._libs import lib -from pandas._libs.parsers import STR_NA_VALUES from pandas.compat._optional import ( get_version, import_optional_dependency, ) from pandas.errors import EmptyDataError -from pandas.util._decorators import (set_module) +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -50,7 +48,6 @@ ) from pandas.core.frame import DataFrame -from pandas.core.shared_docs import _shared_docs from pandas.util.version import Version from pandas.io.common import ( @@ -254,13 +251,13 @@ def read_excel( usecols : str, list-like, or callable, default None * If None, then parse all columns. * If str, then indicates comma separated list of Excel column letters - and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of - both sides. + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. * If list of int, then indicates list of column numbers to be parsed (0-indexed). * If list of string, then indicates list of column names to be parsed. * If callable, then evaluate each column name against it and parse the - column if the callable returns ``True``. + column if the callable returns ``True``. Returns a subset of the columns according to behavior above. 
dtype : Type name or dict of column -> type, default None @@ -318,11 +315,11 @@ def read_excel( * If ``keep_default_na`` is True, and ``na_values`` are specified, ``na_values`` is appended to the default NaN values used for parsing. * If ``keep_default_na`` is True, and ``na_values`` are not specified, only - the default NaN values are used for parsing. + the default NaN values are used for parsing. * If ``keep_default_na`` is False, and ``na_values`` are specified, only - the NaN values specified ``na_values`` are used for parsing. + the NaN values specified ``na_values`` are used for parsing. * If ``keep_default_na`` is False, and ``na_values`` are not specified, no - strings will be parsed as NaN. + strings will be parsed as NaN. Note that if `na_filter` is passed in as False, the ``keep_default_na`` and ``na_values`` parameters will be ignored. @@ -436,14 +433,13 @@ def read_excel( -------- The file can be read using the file name as string or an open file object: - >>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP + >>> pd.read_excel("tmp.xlsx", index_col=0) # doctest: +SKIP Name Value 0 string1 1 1 string2 2 2 #Comment 3 - >>> pd.read_excel(open('tmp.xlsx', 'rb'), - ... sheet_name='Sheet3') # doctest: +SKIP + >>> pd.read_excel(open("tmp.xlsx", "rb"), sheet_name="Sheet3") # doctest: +SKIP Unnamed: 0 Name Value 0 0 string1 1 1 1 string2 2 @@ -451,7 +447,7 @@ def read_excel( Index and header can be specified via the `index_col` and `header` arguments - >>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP + >>> pd.read_excel("tmp.xlsx", index_col=None, header=None) # doctest: +SKIP 0 1 2 0 NaN Name Value 1 0.0 string1 1 @@ -460,8 +456,9 @@ def read_excel( Column types are inferred but can be explicitly specified - >>> pd.read_excel('tmp.xlsx', index_col=0, - ... dtype={'Name': str, 'Value': float}) # doctest: +SKIP + >>> pd.read_excel( + ... "tmp.xlsx", index_col=0, dtype={"Name": str, "Value": float} + ... 
) # doctest: +SKIP Name Value 0 string1 1.0 1 string2 2.0 @@ -471,8 +468,9 @@ def read_excel( but can be explicitly specified, too. Supply the values you would like as strings or lists of strings! - >>> pd.read_excel('tmp.xlsx', index_col=0, - ... na_values=['string1', 'string2']) # doctest: +SKIP + >>> pd.read_excel( + ... "tmp.xlsx", index_col=0, na_values=["string1", "string2"] + ... ) # doctest: +SKIP Name Value 0 NaN 1 1 NaN 2 @@ -481,7 +479,7 @@ def read_excel( Comment lines in the excel input file can be skipped using the ``comment`` kwarg. - >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP + >>> pd.read_excel("tmp.xlsx", index_col=0, comment="#") # doctest: +SKIP Name Value 0 string1 1.0 1 string2 2.0 From 50843bc84cd0f1fc3464d9a77406a1fae2292c65 Mon Sep 17 00:00:00 2001 From: Fidorc80 <114183964+Fidorc80@users.noreply.github.com> Date: Sat, 29 Nov 2025 09:04:13 -0800 Subject: [PATCH 3/4] DOC: inline docstrings for read_excel with proper formatting --- pandas/io/excel/_base.py | 58 +++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a047f2d5f73f6..6e05a1299c069 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -227,7 +227,7 @@ def read_excel( * ``1``: 2nd sheet as a `DataFrame` * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" - as a dict of `DataFrame` + as a dict of `DataFrame` * ``None``: All worksheets. header : int, list of int, default 0 @@ -254,12 +254,13 @@ def read_excel( and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of both sides. * If list of int, then indicates list of column numbers to be parsed - (0-indexed). + (0-indexed). * If list of string, then indicates list of column names to be parsed. * If callable, then evaluate each column name against it and parse the column if the callable returns ``True``. 
Returns a subset of the columns according to behavior above. + dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} Use ``object`` to preserve data as stored in Excel and not interpret dtype, @@ -285,6 +286,7 @@ def read_excel( - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. - Otherwise ``openpyxl`` will be used. + converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -999,14 +1001,14 @@ class ExcelWriter(Generic[_WorkbookT]): mode : {{'w', 'a'}}, default 'w' File mode to use (write or append). Append does not work with fsspec URLs. storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc. For HTTP(S) URLs the key-value pairs - are forwarded to ``urllib.request.Request`` as header options. For other - URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are - forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more - details, and for more examples on storage options refer `here - `_. + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. 
if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error' How to behave when trying to write to a sheet that already @@ -1443,14 +1445,14 @@ def inspect_excel_format( content_or_path : str or file-like object Path to file or content of file to inspect. May be a URL. storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc. For HTTP(S) URLs the key-value pairs - are forwarded to ``urllib.request.Request`` as header options. For other - URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are - forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more - details, and for more examples on storage options refer `here - `_. + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. Returns ------- @@ -1542,16 +1544,16 @@ class ExcelFile: Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. This is not supported, switch to using ``openpyxl`` instead. storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc. For HTTP(S) URLs the key-value pairs - are forwarded to ``urllib.request.Request`` as header options. For other - URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are - forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more - details, and for more examples on storage options refer `here - `_. - engine_kwargs : dict, optional - Arbitrary keyword arguments passed to excel engine. 
+ Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc. For HTTP(S) URLs the key-value pairs + are forwarded to ``urllib.request.Request`` as header options. For other + URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are + forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more + details, and for more examples on storage options refer `here + `_. + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. See Also -------- From caff69974c1487302218149b6c9c06220858d041 Mon Sep 17 00:00:00 2001 From: Fidorc80 <114183964+Fidorc80@users.noreply.github.com> Date: Sun, 30 Nov 2025 15:16:13 -0800 Subject: [PATCH 4/4] DOC: inline docstrings for read_excel with proper formatting. --- pandas/io/excel/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6e05a1299c069..94dbdc4f2b807 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -282,7 +282,7 @@ def read_excel( When ``engine=None``, the following logic will be used to determine the engine: - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. + then `odf `_ will be used. - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. - Otherwise ``openpyxl`` will be used. @@ -368,7 +368,7 @@ def read_excel( format. For anything more complex, please read in as ``object`` and then apply :func:`to_datetime` as-needed. - .. versionadded:: 2.0.0 + .. versionadded:: 2.0.0 thousands : str, default None Thousands separator for parsing string columns to numeric. Note that this parameter is only necessary for columns stored as TEXT in Excel,