Setup
Load the data (Parquet; a CSV alternative is kept commented out)
# Read the dataset from Parquet; the CSV reader is kept for reference.
#df = spark.read.csv('data.csv', header=True, escape='\"')
parquet_path = 'data.parquet'
df = spark.read.parquet(parquet_path)
Show the top n rows (three display variants)
# Preview the first 5 rows in three display modes.
df.show(n=5)                        # default: cell values truncated to 20 chars
df.show(n=5, truncate=0)            # truncate=0 prints full cell contents
df.show(n=5, vertical=True)         # one line per column; handy for wide frames
Basic info: DataFrame shape and summary statistics
def sp_shape(df):
    """Return (row_count, column_count) for a Spark DataFrame, like pandas' ``.shape``.

    Note: ``df.count()`` triggers a Spark job, so this can be expensive on large data.
    """
    # Bug fix: the original returned the bound method ``df.count`` (a callable),
    # not the row count — ``count`` must be called.
    return df.count(), len(df.columns)
# NOTE(review): describe()/summary() appear to skip timestamp columns —
# confirm against the Spark docs if timestamp stats are needed.
basic_stats = df.describe()     # count, mean, stddev, min, max
basic_stats.show()
full_stats = df.summary()       # adds 25% / 50% / 75% percentiles
full_stats.show()