Parquet¶
perf benchmark¶
The best-performing approach is:

To bytes:
pq.write_table(pa.Table.from_pandas(df), buf)

From bytes:
pq.read_table(pa.BufferReader(bytes)).to_pandas()

When considering caching and streaming, it is best to stick with a pyarrow.Table rather than a pandas DataFrame.
def paqdumps(d, i):
    """Serialize DataFrame ``d`` to Parquet bytes using strategy ``i``.

    i == 0: pandas ``DataFrame.to_parquet``        (measured 3.15 s ± 88.8 ms)
    i != 0: pyarrow ``Table`` + ``pq.write_table`` (measured 3.09 s ± 126 ms)

    Returns the Parquet file contents as ``bytes``.
    """
    buf = io.BytesIO()
    if i == 0:
        # 3.15 s ± 88.8 ms
        d.to_parquet(buf)
    else:
        # 3.09 s ± 126 ms
        table = pa.Table.from_pandas(d)
        pq.write_table(table, buf)
    return buf.getvalue()


def paqloads(b, i):
    """Deserialize Parquet bytes ``b`` to a DataFrame using strategy ``i``.

    Strategies 0-5 vary the buffer wrapper (io.BytesIO vs pa.BufferReader)
    and the reader (pd.read_parquet, pq.read_pandas, pq.read_table);
    the timing comments record the original benchmark results.

    Raises:
        ValueError: if ``i`` is not in 0..5.  (The original code fell
        through to ``return f`` with ``f`` unbound — UnboundLocalError.)
    """
    if i == 0:
        # 1.79 s ± 18.9 ms
        buf = io.BytesIO(b)
        f = pd.read_parquet(buf)
    elif i == 1:
        # 1.75 s ± 20.3 ms
        buf = pa.BufferReader(b)
        f = pd.read_parquet(buf)
    elif i == 2:
        # 1.83 s ± 15.5 ms, to_pandas is slow
        buf = io.BytesIO(b)
        f = pq.read_pandas(buf).to_pandas()
    elif i == 3:
        # 1.72 s ± 27.2 ms
        buf = pa.BufferReader(b)
        f = pq.read_pandas(buf).to_pandas()
    elif i == 4:
        # 1.77 s ± 72.9 ms
        buf = io.BytesIO(b)
        f = pq.read_table(buf).to_pandas()
    elif i == 5:
        # 1.70 s ± 21.8 ms
        buf = pa.BufferReader(b)
        f = pq.read_table(buf).to_pandas()
    else:
        raise ValueError(f"unknown load strategy: {i}")
    return f


# Driver: time each dump strategy, then each load strategy on the last dump.
# NOTE(review): `d` (the benchmark DataFrame) is defined elsewhere in the
# original notebook — confirm it is in scope before running.
for i in range(2):
    t0 = time.time()
    b = paqdumps(d, i)
    print(f'dump {i}: {time.time() - t0:.3f}')

for i in range(6):
    t0 = time.time()
    f = paqloads(b, i)
    print(f'load {i}: {time.time() - t0:.3f}')