PyArrow Functionality

pandas can utilize PyArrow to extend functionality and improve the performance of various APIs. This includes:

  • More extensive data types compared to NumPy

  • Missing data support (NA) for all data types

  • Performant IO reader integration

  • Facilitate interoperability with other dataframe libraries based on the Apache Arrow specification (e.g. polars, cuDF)

To use this functionality, please ensure you have installed the minimum supported PyArrow version.

Data Structure Integration

A Series, Index, or the columns of a DataFrame can be directly backed by a pyarrow.ChunkedArray, which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by [pyarrow], e.g. "int64[pyarrow]", into the dtype parameter.

In [1]: ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-63b1e5caba5c> in <module>
----> 1 ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")

/usr/lib/python3/dist-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
    429 
    430         if dtype is not None:
--> 431             dtype = self._validate_dtype(dtype)
    432 
    433         if data is None:

/usr/lib/python3/dist-packages/pandas/core/generic.py in _validate_dtype(cls, dtype)
    490         """validate the passed dtype"""
    491         if dtype is not None:
--> 492             dtype = pandas_dtype(dtype)
    493 
    494             # a compound dtype

/usr/lib/python3/dist-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
   1613 
   1614     # registered extension types
-> 1615     result = registry.find(dtype)
   1616     if result is not None:
   1617         if isinstance(result, type):

/usr/lib/python3/dist-packages/pandas/core/dtypes/base.py in find(self, dtype)
    535         for dtype_type in self.dtypes:
    536             try:
--> 537                 return dtype_type.construct_from_string(dtype)
    538             except TypeError:
    539                 pass

/usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py in construct_from_string(cls, string)
   2201         base_type = string[:-9]  # get rid of "[pyarrow]"
   2202         try:
-> 2203             pa_dtype = pa.type_for_alias(base_type)
   2204         except ValueError as err:
   2205             has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [2]: ser
Out[2]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [3]: idx = pd.Index([True, None], dtype="bool[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-8f0c3c3923b5> in <module>
----> 1 idx = pd.Index([True, None], dtype="bool[pyarrow]")

/usr/lib/python3/dist-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols)
    489 
    490         if dtype is not None:
--> 491             dtype = pandas_dtype(dtype)
    492 
    493         data_dtype = getattr(data, "dtype", None)

/usr/lib/python3/dist-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
   1613 
   1614     # registered extension types
-> 1615     result = registry.find(dtype)
   1616     if result is not None:
   1617         if isinstance(result, type):

/usr/lib/python3/dist-packages/pandas/core/dtypes/base.py in find(self, dtype)
    535         for dtype_type in self.dtypes:
    536             try:
--> 537                 return dtype_type.construct_from_string(dtype)
    538             except TypeError:
    539                 pass

/usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py in construct_from_string(cls, string)
   2201         base_type = string[:-9]  # get rid of "[pyarrow]"
   2202         try:
-> 2203             pa_dtype = pa.type_for_alias(base_type)
   2204         except ValueError as err:
   2205             has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [4]: idx
Out[4]: [0.0, 1.0, 10.0]

In [5]: df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-2a8f1e29c469> in <module>
----> 1 df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")

/usr/lib/python3/dist-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    672     ) -> None:
    673         if dtype is not None:
--> 674             dtype = self._validate_dtype(dtype)
    675 
    676         if isinstance(data, DataFrame):

/usr/lib/python3/dist-packages/pandas/core/generic.py in _validate_dtype(cls, dtype)
    490         """validate the passed dtype"""
    491         if dtype is not None:
--> 492             dtype = pandas_dtype(dtype)
    493 
    494             # a compound dtype

/usr/lib/python3/dist-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
   1613 
   1614     # registered extension types
-> 1615     result = registry.find(dtype)
   1616     if result is not None:
   1617         if isinstance(result, type):

/usr/lib/python3/dist-packages/pandas/core/dtypes/base.py in find(self, dtype)
    535         for dtype_type in self.dtypes:
    536             try:
--> 537                 return dtype_type.construct_from_string(dtype)
    538             except TypeError:
    539                 pass

/usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py in construct_from_string(cls, string)
   2201         base_type = string[:-9]  # get rid of "[pyarrow]"
   2202         try:
-> 2203             pa_dtype = pa.type_for_alias(base_type)
   2204         except ValueError as err:
   2205             has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [6]: df
Out[6]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

Note

The string alias "string[pyarrow]" maps to pd.StringDtype("pyarrow") which is not equivalent to specifying dtype=pd.ArrowDtype(pa.string()). Generally, operations on the data will behave similarly except pd.StringDtype("pyarrow") can return NumPy-backed nullable types while pd.ArrowDtype(pa.string()) will return ArrowDtype.

In [7]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-7-852643f3aad4> in <module>
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [8]: data = list("abc")

In [9]: ser_sd = pd.Series(data, dtype="string[pyarrow]")
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-9-129957706f1c> in <module>
----> 1 ser_sd = pd.Series(data, dtype="string[pyarrow]")

/usr/lib/python3/dist-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
    429 
    430         if dtype is not None:
--> 431             dtype = self._validate_dtype(dtype)
    432 
    433         if data is None:

/usr/lib/python3/dist-packages/pandas/core/generic.py in _validate_dtype(cls, dtype)
    490         """validate the passed dtype"""
    491         if dtype is not None:
--> 492             dtype = pandas_dtype(dtype)
    493 
    494             # a compound dtype

/usr/lib/python3/dist-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
   1613 
   1614     # registered extension types
-> 1615     result = registry.find(dtype)
   1616     if result is not None:
   1617         if isinstance(result, type):

/usr/lib/python3/dist-packages/pandas/core/dtypes/base.py in find(self, dtype)
    535         for dtype_type in self.dtypes:
    536             try:
--> 537                 return dtype_type.construct_from_string(dtype)
    538             except TypeError:
    539                 pass

/usr/lib/python3/dist-packages/pandas/core/arrays/string_.py in construct_from_string(cls, string)
    170             return cls(storage="python")
    171         elif string == "string[pyarrow]":
--> 172             return cls(storage="pyarrow")
    173         elif string == "string[pyarrow_numpy]":
    174             return cls(storage="pyarrow_numpy")

/usr/lib/python3/dist-packages/pandas/core/arrays/string_.py in __init__(self, storage)
    124             )
    125         if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0:
--> 126             raise ImportError(
    127                 "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
    128             )

ImportError: pyarrow>=7.0.0 is required for PyArrow backed StringArray.

In [10]: ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-de34c7f945d7> in <module>
----> 1 ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))

NameError: name 'pa' is not defined

In [11]: ser_ad.dtype == ser_sd.dtype
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-35e971920e72> in <module>
----> 1 ser_ad.dtype == ser_sd.dtype

NameError: name 'ser_ad' is not defined

In [12]: ser_sd.str.contains("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-9f1e70822d42> in <module>
----> 1 ser_sd.str.contains("a")

NameError: name 'ser_sd' is not defined

In [13]: ser_ad.str.contains("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-8c1e712cab71> in <module>
----> 1 ser_ad.str.contains("a")

NameError: name 'ser_ad' is not defined

For PyArrow types that accept parameters, you can pass in a PyArrow type with those parameters into ArrowDtype to use in the dtype parameter.

In [14]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-14-852643f3aad4> in <module>
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [15]: list_str_type = pa.list_(pa.string())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-0ca69e3e396b> in <module>
----> 1 list_str_type = pa.list_(pa.string())

NameError: name 'pa' is not defined

In [16]: ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-e138fa44dc3a> in <module>
----> 1 ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))

NameError: name 'list_str_type' is not defined

In [17]: ser
Out[17]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64
In [18]: from datetime import time

In [19]: idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-2582f29eb373> in <module>
----> 1 idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))

NameError: name 'pa' is not defined

In [20]: idx
Out[20]: [0.0, 1.0, 10.0]
In [21]: from decimal import Decimal

In [22]: decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-b33399ccf112> in <module>
----> 1 decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))

NameError: name 'pa' is not defined

In [23]: data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]

In [24]: df = pd.DataFrame(data, dtype=decimal_type)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-24-54c9b5266dfd> in <module>
----> 1 df = pd.DataFrame(data, dtype=decimal_type)

NameError: name 'decimal_type' is not defined

In [25]: df
Out[25]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

If you already have a pyarrow.Array or pyarrow.ChunkedArray, you can pass it into arrays.ArrowExtensionArray to construct the associated Series, Index or DataFrame object.

In [26]: pa_array = pa.array(
   ....:     [{"1": "2"}, {"10": "20"}, None],
   ....:     type=pa.map_(pa.string(), pa.string()),
   ....: )
   ....: 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-26-47c398bfa275> in <module>
----> 1 pa_array = pa.array(
      2     [{"1": "2"}, {"10": "20"}, None],
      3     type=pa.map_(pa.string(), pa.string()),
      4 )

NameError: name 'pa' is not defined

In [27]: ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-1002c96b04a8> in <module>
----> 1 ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))

NameError: name 'pa_array' is not defined

In [28]: ser
Out[28]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

To retrieve a pyarrow.ChunkedArray from a Series or Index, you can call the pyarrow array constructor on the Series or Index.

In [29]: ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-29-8efb65ed6b64> in <module>
----> 1 ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")

/usr/lib/python3/dist-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
    429 
    430         if dtype is not None:
--> 431             dtype = self._validate_dtype(dtype)
    432 
    433         if data is None:

/usr/lib/python3/dist-packages/pandas/core/generic.py in _validate_dtype(cls, dtype)
    490         """validate the passed dtype"""
    491         if dtype is not None:
--> 492             dtype = pandas_dtype(dtype)
    493 
    494             # a compound dtype

/usr/lib/python3/dist-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
   1613 
   1614     # registered extension types
-> 1615     result = registry.find(dtype)
   1616     if result is not None:
   1617         if isinstance(result, type):

/usr/lib/python3/dist-packages/pandas/core/dtypes/base.py in find(self, dtype)
    535         for dtype_type in self.dtypes:
    536             try:
--> 537                 return dtype_type.construct_from_string(dtype)
    538             except TypeError:
    539                 pass

/usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py in construct_from_string(cls, string)
   2201         base_type = string[:-9]  # get rid of "[pyarrow]"
   2202         try:
-> 2203             pa_dtype = pa.type_for_alias(base_type)
   2204         except ValueError as err:
   2205             has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [30]: pa.array(ser)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-30-1dcc395634e8> in <module>
----> 1 pa.array(ser)

NameError: name 'pa' is not defined

In [31]: idx = pd.Index(ser)

In [32]: pa.array(idx)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-32-7c99dd1103e5> in <module>
----> 1 pa.array(idx)

NameError: name 'pa' is not defined

To convert a pyarrow.Table to a DataFrame, you can call the pyarrow.Table.to_pandas() method with types_mapper=pd.ArrowDtype.

In [33]: table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-33-032ec84d4bf4> in <module>
----> 1 table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])

NameError: name 'pa' is not defined

In [34]: df = table.to_pandas(types_mapper=pd.ArrowDtype)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-34-64ec62289cb4> in <module>
----> 1 df = table.to_pandas(types_mapper=pd.ArrowDtype)

/usr/lib/python3/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   6202         ):
   6203             return self[name]
-> 6204         return object.__getattribute__(self, name)
   6205 
   6206     @final

AttributeError: 'DataFrame' object has no attribute 'to_pandas'

In [35]: df
Out[35]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

In [36]: df.dtypes
Out[36]: 
a    object
b    object
dtype: object

Operations

PyArrow data structure integration is implemented through pandas’ ExtensionArray interface; therefore, supported functionality exists where this interface is integrated within the pandas API. Additionally, this functionality is accelerated with PyArrow compute functions where available. This includes:

  • Numeric aggregations

  • Numeric arithmetic

  • Numeric rounding

  • Logical and comparison functions

  • String functionality

  • Datetime functionality

The following are just some examples of operations that are accelerated by native PyArrow compute functions.

In [37]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-37-852643f3aad4> in <module>
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [38]: ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-38-87d7ea9fa799> in <module>
----> 1 ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")

/usr/lib/python3/dist-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
    429 
    430         if dtype is not None:
--> 431             dtype = self._validate_dtype(dtype)
    432 
    433         if data is None:

/usr/lib/python3/dist-packages/pandas/core/generic.py in _validate_dtype(cls, dtype)
    490         """validate the passed dtype"""
    491         if dtype is not None:
--> 492             dtype = pandas_dtype(dtype)
    493 
    494             # a compound dtype

/usr/lib/python3/dist-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
   1613 
   1614     # registered extension types
-> 1615     result = registry.find(dtype)
   1616     if result is not None:
   1617         if isinstance(result, type):

/usr/lib/python3/dist-packages/pandas/core/dtypes/base.py in find(self, dtype)
    535         for dtype_type in self.dtypes:
    536             try:
--> 537                 return dtype_type.construct_from_string(dtype)
    538             except TypeError:
    539                 pass

/usr/lib/python3/dist-packages/pandas/core/dtypes/dtypes.py in construct_from_string(cls, string)
   2201         base_type = string[:-9]  # get rid of "[pyarrow]"
   2202         try:
-> 2203             pa_dtype = pa.type_for_alias(base_type)
   2204         except ValueError as err:
   2205             has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [39]: ser.mean()
Out[39]: 2.0

In [40]: ser + ser
Out[40]: 
0    0.0
1    2.0
2    4.0
3    6.0
4    8.0
dtype: float64

In [41]: ser > (ser + 1)
Out[41]: 
0    False
1    False
2    False
3    False
4    False
dtype: bool

In [42]: ser.dropna()
Out[42]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [43]: ser.isna()
Out[43]: 
0    False
1    False
2    False
3    False
4    False
dtype: bool

In [44]: ser.fillna(0)
Out[44]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64
In [45]: ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-45-36b4e12fb968> in <module>
----> 1 ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))

NameError: name 'pa' is not defined

In [46]: ser_str.str.startswith("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-46-420f96afcbff> in <module>
----> 1 ser_str.str.startswith("a")

NameError: name 'ser_str' is not defined
In [47]: from datetime import datetime

In [48]: pa_type = pd.ArrowDtype(pa.timestamp("ns"))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-48-561eb9871ca8> in <module>
----> 1 pa_type = pd.ArrowDtype(pa.timestamp("ns"))

NameError: name 'pa' is not defined

In [49]: ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-49-21504a973ac8> in <module>
----> 1 ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)

NameError: name 'pa_type' is not defined

In [50]: ser_dt.dt.strftime("%Y-%m")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-50-903aff7556ca> in <module>
----> 1 ser_dt.dt.strftime("%Y-%m")

NameError: name 'ser_dt' is not defined

I/O Reading

PyArrow also provides IO reading functionality that has been integrated into several pandas IO readers. The following functions provide an engine keyword that can dispatch to PyArrow to accelerate reading from an IO source.

  • read_csv()

  • read_json()

  • read_orc()

  • read_feather()

In [51]: import io

In [52]: data = io.StringIO("""a,b,c
   ....:    1,2.5,True
   ....:    3,4.5,False
   ....: """)
   ....: 

In [53]: df = pd.read_csv(data, engine="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
/usr/lib/python3/dist-packages/pandas/compat/_optional.py in import_optional_dependency(name, extra, errors, min_version)
    136     try:
--> 137         module = importlib.import_module(name)
    138     except ImportError:

/usr/lib/python3.11/importlib/__init__.py in import_module(name, package)
    125             level += 1
--> 126     return _bootstrap._gcd_import(name[level:], package, level)
    127 

/usr/lib/python3.11/importlib/_bootstrap.py in _gcd_import(name, package, level)

/usr/lib/python3.11/importlib/_bootstrap.py in _find_and_load(name, import_)

/usr/lib/python3.11/importlib/_bootstrap.py in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
<ipython-input-53-47696af33742> in <module>
----> 1 df = pd.read_csv(data, engine="pyarrow")

/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    946     kwds.update(kwds_defaults)
    947 
--> 948     return _read(filepath_or_buffer, kwds)
    949 
    950 

/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
    615 
    616     with parser:
--> 617         return parser.read(nrows)
    618 
    619 

/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py in read(self, nrows)
   1734             try:
   1735                 # error: "ParserBase" has no attribute "read"
-> 1736                 df = self._engine.read()  # type: ignore[attr-defined]
   1737             except Exception:
   1738                 self.close()

/usr/lib/python3/dist-packages/pandas/io/parsers/arrow_parser_wrapper.py in read(self)
    187             The DataFrame created from the CSV file.
    188         """
--> 189         pa = import_optional_dependency("pyarrow")
    190         pyarrow_csv = import_optional_dependency("pyarrow.csv")
    191         self._get_pyarrow_options()

/usr/lib/python3/dist-packages/pandas/compat/_optional.py in import_optional_dependency(name, extra, errors, min_version)
    138     except ImportError:
    139         if errors == "raise":
--> 140             raise ImportError(msg)
    141         return None
    142 

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [54]: df
Out[54]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

By default, these functions and all other IO reader functions return NumPy-backed data. These readers can return PyArrow-backed data by specifying the parameter dtype_backend="pyarrow". A reader does not need to set engine="pyarrow" in order to return PyArrow-backed data.

In [55]: import io

In [56]: data = io.StringIO("""a,b,c,d,e,f,g,h,i
   ....:     1,2.5,True,a,,,,,
   ....:     3,4.5,False,b,6,7.5,True,a,
   ....: """)
   ....: 

In [57]: df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
/usr/lib/python3/dist-packages/pandas/compat/_optional.py in import_optional_dependency(name, extra, errors, min_version)
    136     try:
--> 137         module = importlib.import_module(name)
    138     except ImportError:

/usr/lib/python3.11/importlib/__init__.py in import_module(name, package)
    125             level += 1
--> 126     return _bootstrap._gcd_import(name[level:], package, level)
    127 

/usr/lib/python3.11/importlib/_bootstrap.py in _gcd_import(name, package, level)

/usr/lib/python3.11/importlib/_bootstrap.py in _find_and_load(name, import_)

/usr/lib/python3.11/importlib/_bootstrap.py in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
<ipython-input-57-1084ac23d87a> in <module>
----> 1 df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")

/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    946     kwds.update(kwds_defaults)
    947 
--> 948     return _read(filepath_or_buffer, kwds)
    949 
    950 

/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
    609 
    610     # Create the parser.
--> 611     parser = TextFileReader(filepath_or_buffer, **kwds)
    612 
    613     if chunksize or iterator:

/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
   1446 
   1447         self.handles: IOHandles | None = None
-> 1448         self._engine = self._make_engine(f, self.engine)
   1449 
   1450     def close(self) -> None:

/usr/lib/python3/dist-packages/pandas/io/parsers/readers.py in _make_engine(self, f, engine)
   1721 
   1722         try:
-> 1723             return mapping[engine](f, **self.options)
   1724         except Exception:
   1725             if self.handles is not None:

/usr/lib/python3/dist-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
     90         if kwds["dtype_backend"] == "pyarrow":
     91             # Fail here loudly instead of in cython after reading
---> 92             import_optional_dependency("pyarrow")
     93         self._reader = parsers.TextReader(src, **kwds)
     94 

/usr/lib/python3/dist-packages/pandas/compat/_optional.py in import_optional_dependency(name, extra, errors, min_version)
    138     except ImportError:
    139         if errors == "raise":
--> 140             raise ImportError(msg)
    141         return None
    142 

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [58]: df_pyarrow.dtypes
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-7afbf1a65569> in <module>
----> 1 df_pyarrow.dtypes

NameError: name 'df_pyarrow' is not defined

Several non-IO reader functions can also use the dtype_backend argument to return PyArrow-backed data including: