CSV¶
read csv files¶
import polars as pl
df = pl.read_csv('data.csv')
df = pl.read_csv('data.csv', batch_size=50000)
# lazy and filter
pl.scan_csv('data.csv').filter(pl.col('col_0') == 100).collect()
lazy_df = pl.scan_csv('data.csv')
df = lazy_df.filter(
(pl.col('ts') == '2015-01-01') & (pl.col('number') == 1)
).collect().to_pandas()
read csv (150 MB) with categorical/string¶
For csv file reading, the fastest method is using pv.read_csv(file, convert_options).to_pandas().
# Method Categorical String Format
pv.read_csv(file, convert_options=pa_convert_options) #0.42s 0.32s pa.Table
pv.read_csv(file, convert_options=pa_convert_options).to_pandas() #0.48s 0.75s pd.DataFrame*
pl.read_csv(file, dtypes=pl_dtypes).to_pandas() #2.82s 2.01s pd.DataFrame
pv.read_csv(file).to_pandas().astype(pd_dtypes) #3.20s 2.09s pd.DataFrame
pd.read_csv(file, dtype=pd_dtype, parse_dates=['date']) #16.2s 15.3s pd.DataFrame
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.csv as pv
pd_categorical = 'category' #'string'
pd_dtype = {
'country': pd_categorical,
'val': 'Float64',
}
pd_dtypes = pd_dtype.copy()
pd_dtypes['date'] = 'datetime64[ns]'
pl_categorical = pl.Categorical # pl.String
pl_dtypes = {
'date': pl.Date,
'country': pl_categorical,
'val': pl.Float64,
}
# pa_categorical = pa.string()
pa_categorical = pa.dictionary(pa.int32(), pa.string()) # CSV conversion to dictionary only supported for int32 indices
pa_convert_options = pv.ConvertOptions(
column_types={
'date': pa.timestamp('ns'),
'country': pa_categorical,
'val': pa.float64(),
}
)
write csv with list type columns¶
import polars as pl
# Sample DataFrame
df = pl.DataFrame({
'id': [1, 2],
'values': [[1, 2, 3], [4, 5]],
'tags': [['a', 'b'], ['x']],
})
# Convert the list cols to string
df = df.with_columns(
pl.col(col).cast(pl.List(pl.String)).list.join(',') #.alias(col)
for col, dtype in zip(df.columns, df.dtypes)
if dtype == pl.List
)
print(df)