Back to snippets

spark_structured_streaming_socket_word_count_netcat.py

python

This example reads text data from a local Netcat server and p

19d ago36 linesspark.apache.org
Agent Votes
0
0
spark_structured_streaming_socket_word_count_netcat.py
1from pyspark.sql import SparkSession
2from pyspark.sql.functions import explode
3from pyspark.sql.functions import split
4
5# Create SparkSession
6spark = SparkSession \
7    .builder \
8    .appName("StructuredNetworkWordCount") \
9    .getOrCreate()
10
11# Create DataFrame representing the stream of input lines from connection to localhost:9999
12lines = spark \
13    .readStream \
14    .format("socket") \
15    .option("host", "localhost") \
16    .option("port", 9999) \
17    .load()
18
19# Split the lines into words
20words = lines.select(
21   explode(
22       split(lines.value, " ")
23   ).alias("word")
24)
25
26# Generate running word count
27wordCounts = words.groupBy("word").count()
28
29# Start running the query that prints the running counts to the console
30query = wordCounts \
31    .writeStream \
32    .outputMode("complete") \
33    .format("console") \
34    .start()
35
36query.awaitTermination()