Skip to content

Group

select and where

d = (
    df
    .select('name', 'price')
    .where((df['id']==1) | (df['id']==3))
)
display(d)

count

d = (
    df
    .select('id', 'name')
    .groupBy('name')
    .count()
)
display(d)

count distinct

d = df.groupBy('Country').agg(countDistinct('CustomerID').alias('country_count')).orderBy(desc('country_count'))
d.show()

min/max

#d1.agg({'InvoiceDate': 'min', 'InvoiceDate': 'max'}).first()[0] # not work as dict cannot have same keys
from pyspark.sql import functions as f
d = df.agg(
    f.min('InvoiceDate').alias('mn'),
    f.max('InvoiceDate').alias('mx'),
    f.avg('InvoiceDate').alias('ag'),
    f.sum('InvoiceDate').alias('sm'),
).first()
mn, mx, ag, sm = d #d is a Row, we can also get mn by d.mn or d['mn'] or d[:1]