Structured Streaming

Read in Streaming Data

Reading JSON files from storage

from pyspark.sql.types import *

# Directory that is read as a JSON file source.
inputPath = "/mnt/data/jsonfiles/"

# Define your schema if it's known (rather than relying on Spark to infer the schema)
jsonSchema = StructType([
    StructField("time", TimestampType(), True),
    StructField("id", IntegerType(), True),
    StructField("value", StringType(), True),
])

# NOTE: `spark` is not defined in this snippet; it is assumed to be an
# existing SparkSession (e.g. the one a notebook provides) -- confirm.
#
# The chain is wrapped in parentheses instead of using backslash
# continuations: a backslash must be the last character on its line, so a
# trailing comment after "\" is a SyntaxError.
streamingInputDF = (
    spark.readStream
    .schema(jsonSchema)
    # Treat a sequence of files as a stream by picking one file at a time
    .option("maxFilesPerTrigger", 1)
    .json(inputPath)
)
Copied!

References