Broadcast¶

multiindex¶

reindex with level can broadcast the level. also df.div can broadcast only one level.

not good, will repeat all combinations:

va = np.array([[1991,1992],[6,7],[8,9],[1,2],[3,4]]).T
df = pd.DataFrame(va,columns=['a','b','c','x','y']).set_index(['a','b','c'])

d2 = df.groupby(['a','c']).mean()
d3 = d2.reindex(df.unstack('b').columns, axis=1, level=0)
d3 = d3.stack(level=1).reorder_levels(['a','b','c'])

broadcast with reindex¶

get grouped mean and then broadcast back to the original df rows.

va = np.array([[1991,1991,1992],[6,7,7],[8,8,10],[1,2,3],[3,4,5]]).T
df = pd.DataFrame(va,columns=['a','b','c','x','y']).set_index(['a','b','c'])

ia = df.index.get_level_values('a')
ic = df.index.get_level_values('c')
mi = pd.MultiIndex.from_arrays([ia,ic],names=['a','c'])
d2 = df.groupby(['a','c']).mean()
d2.reindex(mi)
d2.reindex(df.index.droplevel('b')) #better? it seems reindex is faster than loc for single index
d2.loc[df.index.droplevel('b'),:] #best?

# we can also use merge to combine the grouped mean df to the original df without reindex
# note df has index a,b,c and d2 has index a,c
df_merged = df.merge(d2, left_index=True, right_index=True)

Expand df date ranges to individual rows¶

df = (
    df
    .assign(
        DATE=lambda x:
            [
                pd.date_range(row.STARTDATE, row.ENDDATE, freq='d')
                for _,row in x.iterrows()
            ]
    )
    .explode('DATE')
    .drop(['STARTDATE', 'ENDDATE'], axis=1)
)

At least 10x faster:

def explode_date_range(
    df: pd.DataFrame,
    start_date_col: str,
    end_date_col: str,
    new_date_col: str = 'ts',
    freq: str = '30T',
    end_date_offset: pd.DateOffset = None,
) -> pd.DataFrame:
    df = df.reset_index(drop=True)
    if end_date_offset is not None:
        df[end_date_col] += end_date_offset
    # Get exploded timestamp column
    dt = (
        pd.concat([
            pd.DataFrame({
                'i': i,
                'ts': pd.date_range(start=s, end=e, freq=freq)
            })
            for i, (s, e) in enumerate(zip(df[start_date_col], df[end_date_col]))
        ])
        .set_index('i')
        .rename_axis(None, axis=0)
    )
    # Re-sample df based on new timestamp column
    df = (
        df
        .drop(columns=[start_date_col, end_date_col])
        .reindex(dt.index)
        .assign(ds=dt.ts)
        .rename(columns={'ds': new_date_col})
    )
    return df