Parquet

Performance benchmark

The fastest combination is:

to bytes: pq.write_table(pa.Table.from_pandas(df), buf)
from bytes: pq.read_table(pa.BufferReader(bytes)).to_pandas()

When caching or streaming matters, it's best to keep working with a pyarrow.Table rather than a pandas DataFrame.

def paqdumps(d, i):
    """Serialize ``d`` to Parquet and return the encoded bytes.

    Two benchmark variants write into an in-memory ``io.BytesIO``:

    * ``i == 0``: ``d.to_parquet(...)``
    * any other ``i``: ``pa.Table.from_pandas(d)`` followed by
      ``pq.write_table(...)``

    The timings in the comments are the measurements recorded alongside
    each variant.

    Args:
        d: Object to serialize; it must provide ``to_parquet`` for variant 0
            and be accepted by ``pa.Table.from_pandas`` otherwise.
        i: Variant selector.

    Returns:
        The full contents of the in-memory buffer as ``bytes``.
    """
    sink = io.BytesIO()
    if i != 0:
        # 3.09 s ± 126 ms
        pq.write_table(pa.Table.from_pandas(d), sink)
    else:
        # 3.15 s ± 88.8 ms
        d.to_parquet(sink)
    return sink.getvalue()

def paqloads(b, i):
    """Deserialize Parquet bytes using benchmark variant ``i``.

    The six variants pair two ways of wrapping the bytes (``io.BytesIO`` for
    even ``i``, ``pa.BufferReader`` for odd ``i``) with three readers:
    ``pd.read_parquet``, ``pq.read_pandas(...).to_pandas()`` and
    ``pq.read_table(...).to_pandas()``. The timings in the comments are the
    measurements recorded alongside each variant.

    Args:
        b: Parquet-encoded bytes, e.g. the value returned by ``paqdumps``.
        i: Variant selector, one of 0-5.

    Returns:
        The object produced by the selected reader.

    Raises:
        ValueError: If ``i`` is not one of the six known variants. (An
            unknown variant used to fall through every branch and fail with
            ``UnboundLocalError`` at the ``return``.)
    """
    if i == 0:
        # 1.79 s ± 18.9 ms
        return pd.read_parquet(io.BytesIO(b))
    if i == 1:
        # 1.75 s ± 20.3 ms
        return pd.read_parquet(pa.BufferReader(b))
    if i == 2:
        # 1.83 s ± 15.5 ms, to_pandas is slow
        return pq.read_pandas(io.BytesIO(b)).to_pandas()
    if i == 3:
        # 1.72 s ± 27.2 ms
        return pq.read_pandas(pa.BufferReader(b)).to_pandas()
    if i == 4:
        # 1.77 s ± 72.9 ms
        return pq.read_table(io.BytesIO(b)).to_pandas()
    if i == 5:
        # 1.70 s ± 21.8 ms
        return pq.read_table(pa.BufferReader(b)).to_pandas()
    # Fail loudly with a descriptive error instead of an unbound local.
    raise ValueError(f'unknown load variant {i!r}; expected an integer 0-5')

# Time each serialization variant once. ``b`` ends up holding the bytes from
# the last variant run, which the load loop below then reads.
# NOTE(review): ``d`` is not defined in this section; presumably the DataFrame
# under test is created earlier. Confirm.
for i in range(2):
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed by wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    b = paqdumps(d, i)
    print(f'dump {i}: {time.perf_counter() - t0:.3f}')
# Time each deserialization variant once against those bytes.
for i in range(6):
    t0 = time.perf_counter()
    f = paqloads(b, i)
    print(f'load {i}: {time.perf_counter() - t0:.3f}')