Setup
Load the data (Parquet; a CSV alternative is kept commented out)
# Read the dataset from Parquet; the CSV reader is kept for reference.
#df = spark.read.csv('data.csv', header=True, escape='\"')
parquet_path = 'data.parquet'
df = spark.read.parquet(parquet_path)
Show the top n rows (three display variants)
# Preview the first 5 rows in three display modes.
df.show(n=5)                        # default: cell values truncated to 20 chars
df.show(n=5, truncate=0)            # truncate=0 prints full cell contents
df.show(n=5, vertical=True)         # one line per column; handy for wide frames
Basic info: DataFrame shape and summary statistics
def sp_shape(df):
    """Return (row_count, column_count) for a Spark DataFrame, like pandas' ``.shape``.

    Note: ``df.count()`` triggers a Spark job, so this can be expensive on large data.
    """
    # Bug fix: the original returned the bound method ``df.count`` (a callable),
    # not the row count — ``count`` must be called.
    return df.count(), len(df.columns)
# NOTE(review): describe()/summary() appear to skip timestamp columns —
# confirm against the Spark docs if timestamp stats are needed.
basic_stats = df.describe()     # count, mean, stddev, min, max
basic_stats.show()
full_stats = df.summary()       # adds 25% / 50% / 75% percentiles
full_stats.show()