Skip to content

File IO

csv

With header (the first row of the file holds the column names)

# Option 1: the dedicated CSV reader. `header=True` asks Spark to take the
# column names from the first row; `escape` is set to the double-quote
# character.
df = spark.read.csv(
    'data.csv',
    header=True,
    escape='\"',
)

# Option 2: the generic loader with the format named explicitly. This
# rebinds `df`, so only this second result is previewed below.
df = spark.read.load(
    'data.csv',
    format='csv',
    header=True,
)

# Preview only the first five rows.
# NOTE(review): `display` is not defined in this snippet -- presumably the
# notebook environment's (e.g. Databricks) built-in for rendering a
# DataFrame as a table; confirm it exists where this runs.
display(df.limit(5))

Without header (column names and types come from an explicit schema)

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Column names and types are declared by hand because the file is read
# with header=False below. Each field keeps the default nullability.
csv_schema = (
    StructType()
    .add('id', IntegerType())
    .add('name', StringType())
    .add("date", DateType())
    .add('price', FloatType())
)

# Generic loader with the format named explicitly; the schema above is
# passed in rather than being derived from the file.
df = spark.read.load(
    'data.csv',
    format='csv',
    header=False,
    schema=csv_schema,
)

# Preview only the first five rows.
# NOTE(review): `display` is not defined in this snippet -- presumably the
# notebook environment's (e.g. Databricks) built-in for rendering a
# DataFrame as a table; confirm it exists where this runs.
display(df.limit(5))