Skip to content

Benchmark

Parquet has a slower dump speed, but it is much faster when loading the data back, and its size is much smaller.

df to bytes to df

Average of 5 runs, in seconds, without redis (timings with redis in parentheses)

   name  dump (redis)  load (redis)    MB
pickle    0.7   (1.9)   1.0   (3.9) 150.5
feather   1.4   (1.8)   0.7   (1.6)  56.7
parquet   2.9   (3.1)   0.9   (1.4)  26.5

pa.Table to bytes to pa.Table

Average of 5 runs, in seconds, without redis (timings with redis in parentheses)

   name  dump (redis)  load (redis)    MB
 pickle   1.1   (2.7)   0.5   (6.0) 269.3
feather   0.5   (0.8)   0.1   (1.2)  56.7
fea(zstd)       (0.8)         (1.0)  42.1
parquet   1.8   (2.1)   0.3   (1.0)  26.5
paq(zstd)       (2.0)         (0.7)  21.0

code

import io
import pickle
from functools import partial
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.feather as pf
import time

# Toggle for the redis round trip. When True, the pickle/parquet/feather dumps
# helpers also store their payload under the key 'df' via r.set, and the
# matching loads helpers ignore their argument and fetch that key via r.get.
# NOTE(review): `r` is not defined anywhere in this snippet; presumably a redis
# client that must be created before enabling this flag. Confirm.
test_redis = False

def timeit(func, n=5):
    """Return the average wall-clock seconds per call of ``func``.

    Args:
        func: Zero-argument callable to benchmark.
        n: Number of back-to-back calls to average over.

    Returns:
        Mean elapsed time per call, in seconds, as a float.

    Raises:
        ZeroDivisionError: If ``n`` is 0.
    """
    # perf_counter is monotonic and high resolution; time.time() follows the
    # system clock and can jump if that clock is adjusted mid-benchmark.
    start = time.perf_counter()
    for _ in range(n):
        func()
    return (time.perf_counter() - start) / n

def pkldumps(d):
    """Pickle ``d`` and return the resulting bytes.

    When the module-level ``test_redis`` flag is set, the payload is also
    stored under the key 'df' through ``r.set`` before being returned.
    """
    payload = pickle.dumps(d)
    if not test_redis:
        return payload
    r.set('df', payload)
    return payload

def pklloads(b):
    """Unpickle a payload and return the reconstructed object.

    When the module-level ``test_redis`` flag is set, ``b`` is ignored and the
    payload is fetched from the key 'df' through ``r.get`` instead.

    NOTE: unpickling executes arbitrary code, so this is only safe while the
    bytes being loaded come from this benchmark itself.
    """
    payload = r.get('df') if test_redis else b
    return pickle.loads(payload)

def paqdumps(d):
    """Serialize ``d`` to Parquet in memory and return the bytes.

    ``d`` is handed directly to ``pq.write_table``. When the module-level
    ``test_redis`` flag is set, the payload is also stored under the key 'df'
    through ``r.set``.
    """
    sink = io.BytesIO()
    # DataFrame variant of the benchmark: pass pa.Table.from_pandas(d) here.
    pq.write_table(d, sink)
    payload = sink.getvalue()
    if not test_redis:
        return payload
    r.set('df', payload)
    return payload

def paqloads(b):
    """Read a Parquet payload and return the result of ``pq.read_table``.

    When the module-level ``test_redis`` flag is set, ``b`` is ignored and the
    payload is fetched from the key 'df' through ``r.get`` instead.
    """
    payload = r.get('df') if test_redis else b
    source = pa.BufferReader(payload)
    # DataFrame variant of the benchmark: append .to_pandas() to the result.
    return pq.read_table(source)

def feadumps(d):
    """Serialize ``d`` to Feather in memory and return the bytes.

    ``d`` is handed directly to ``pf.write_feather``. When the module-level
    ``test_redis`` flag is set, the payload is also stored under the key 'df'
    through ``r.set``.
    """
    sink = io.BytesIO()
    # pandas-only alternative: d.to_feather(sink)
    pf.write_feather(d, sink)
    payload = sink.getvalue()
    if not test_redis:
        return payload
    r.set('df', payload)
    return payload

def fealoads(b):
    """Read a Feather payload and return the result of ``pf.read_table``.

    When the module-level ``test_redis`` flag is set, ``b`` is ignored and the
    payload is fetched from the key 'df' through ``r.get`` instead.
    """
    payload = r.get('df') if test_redis else b
    source = io.BytesIO(payload)
    # pandas alternative: pd.read_feather(source)
    return pf.read_table(source)

def hdfdumps(d):
    """Serialize DataFrame ``d`` to HDF5 under the key 'bar' and return the bytes.

    ``DataFrame.to_hdf`` needs a real file path and cannot write to an
    in-memory stream, so the data is written to a temporary file that is read
    back and removed before returning.

    Args:
        d: Object exposing ``to_hdf`` (a pandas DataFrame or Series).

    Returns:
        The raw contents of the HDF5 file as ``bytes``.

    Raises:
        ImportError: If the optional PyTables dependency is not installed.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as workdir:
        path = os.path.join(workdir, 'frame.h5')
        d.to_hdf(path, key='bar', mode='w')
        with open(path, 'rb') as handle:
            return handle.read()

def hdfloads(b):
    """Load the object stored under the key 'bar' from HDF5 bytes ``b``.

    ``pd.read_hdf`` needs a real file path and cannot read from an in-memory
    stream, so the bytes are written to a temporary file first. The file is
    removed before returning.

    Args:
        b: Raw contents of an HDF5 file containing the key 'bar'.

    Returns:
        Whatever ``pd.read_hdf`` returns for that key.

    Raises:
        ImportError: If the optional PyTables dependency is not installed.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as workdir:
        path = os.path.join(workdir, 'frame.h5')
        with open(path, 'wb') as handle:
            handle.write(b)
        return pd.read_hdf(path, key='bar', mode='r')

# Format name -> [loads, dumps] pair exercised by the benchmark loop below.
dc = {
    'pickle': [pklloads, pkldumps],
    'feather': [fealoads, feadumps],
    'parquet': [paqloads, paqdumps],
    #'hdf': [hdfloads, hdfdumps],  not supported
}

# NOTE: `df` (the object being benchmarked) is not defined in this snippet and
# must exist before this loop runs.
result = []
for name, codec in dc.items():
    loads, dumps = codec
    print(name)
    # One untimed dump provides both the payload for the load timing and its size.
    b = dumps(df)
    s = len(b) / 1024 / 1024
    dump_secs = timeit(partial(dumps, df))
    load_secs = timeit(partial(loads, b))
    result.append([name, dump_secs, load_secs, s])
dt = pd.DataFrame(result, columns=['name', 'dump', 'load', 'MB'])
print(dt.round(1).to_string(index=False))