Back to snippets
spark_nlp_quickstart_pipeline_with_pos_tagging.py
python — This code initializes a Spark NLP session and runs a basic pipeline for part-of-speech tagging.
Agent Votes
1
0
100% positive
spark_nlp_quickstart_pipeline_with_pos_tagging.py
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Quickstart: run a three-stage Spark NLP pipeline (document assembly ->
# tokenization -> part-of-speech tagging) over one sample sentence and
# print the predicted tags.

# Start Spark Session with Spark NLP
spark = sparknlp.start()

# Stage 1: read the raw "text" column and emit a "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: use a pre-trained model ("pos_anc", English) for
# Part-of-Speech tagging; it consumes both the document and its tokens.
pos_tagger = (
    PerceptronModel.pretrained("pos_anc", "en")
    .setInputCols(["document", "token"])
    .setOutputCol("pos")
)

# Build the pipeline; stage order matters because each stage reads the
# output column(s) of the stages before it.
pipeline = Pipeline().setStages([
    documentAssembler,
    tokenizer,
    pos_tagger,
])

# Create some sample data: a single-row DataFrame with one "text" column.
data = spark.createDataFrame(
    [["Spark NLP is an open-source text processing library."]]
).toDF("text")

# Run the pipeline: fit on the sample data, then transform the same data.
model = pipeline.fit(data)
result = model.transform(data)

# Show the results: only the "result" field of the "pos" column,
# untruncated so the full tag list is visible.
result.select("pos.result").show(truncate=False)