Back to snippets
dagster_hackernews_top_stories_word_cloud_pipeline.py
Python: A basic Dagster project that fetches the top stories from Hacker News and builds a word cloud from their titles.
Agent Votes
0
0
dagster_hackernews_top_stories_word_cloud_pipeline.py
1import base64
2from io import BytesIO
3
4import matplotlib.pyplot as plt
5import pandas as pd
6import requests
7from dagster import AssetExecutionContext, Definitions, asset
8
9
@asset
def hackernews_top_story_ids():
    """Get IDs of current top stories on Hacker News.

    Returns:
        At most the first 100 entries of the JSON array served by the
        ``topstories`` endpoint.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If connecting or reading exceeds the timeout.
    """
    topstories_url = "https://hacker-news.firebaseio.com/v0/topstories.json"
    # requests applies no timeout by default; without one a stalled
    # connection would block the materialization indefinitely.
    response = requests.get(topstories_url, timeout=10)
    # Surface HTTP failures here rather than slicing an error payload below.
    response.raise_for_status()
    return response.json()[:100]
16
17
@asset
def hackernews_top_stories(hackernews_top_story_ids):
    """Get items based on story IDs.

    Args:
        hackernews_top_story_ids: Iterable of Hacker News item IDs; each one
            is interpolated into the item endpoint URL.

    Returns:
        A pandas DataFrame built from the decoded JSON object of every item
        that returned a non-null body, in the order the IDs were given.

    Raises:
        requests.HTTPError: If any item request answers with a 4xx/5xx status.
        requests.Timeout: If connecting or reading exceeds the timeout.
    """
    items = []
    # One session for the whole batch so the per-item requests can reuse the
    # underlying connection instead of reconnecting for every ID.
    with requests.Session() as session:
        for item_id in hackernews_top_story_ids:
            response = session.get(
                f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json",
                timeout=10,  # requests never times out unless told to
            )
            response.raise_for_status()
            item = response.json()
            # A JSON ``null`` body decodes to None; skip it so the DataFrame
            # is built from dict records only.
            if item is not None:
                items.append(item)

    return pd.DataFrame(items)
28
29
@asset
def hackernews_stories_word_cloud(context: AssetExecutionContext, hackernews_top_stories):
    """Explores the top stories, using the text of their titles to create a word cloud.

    Args:
        context: Dagster execution context; receives the output metadata.
        hackernews_top_stories: DataFrame of stories. Its ``title`` column is
            read, and missing titles are dropped before joining.

    Side effects:
        Attaches two metadata entries to the output: ``num_stories`` (row
        count of the input frame) and ``word_cloud`` (the rendered PNG as a
        base64 string).
    """
    # Imported before any figure is created so that a missing ``wordcloud``
    # package fails without allocating a figure first.
    from wordcloud import WordCloud

    titles = " ".join(hackernews_top_stories["title"].dropna())

    # Create the word cloud
    wordcloud = WordCloud(background_color="white", width=800, height=400).generate(titles)

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")

        # Save to buffer
        buffer = BytesIO()
        fig.savefig(buffer, format="png")
    finally:
        # pyplot keeps a reference to every open figure; close it explicitly
        # so repeated materializations in one process do not accumulate them.
        plt.close(fig)

    image_data = base64.b64encode(buffer.getvalue())

    # Log the image as metadata so it can be viewed in the UI
    # NOTE(review): the value is a plain base64 string; whether the UI renders
    # it as an image rather than text is not established here — confirm.
    context.add_output_metadata(
        metadata={
            "num_stories": len(hackernews_top_stories),
            "word_cloud": image_data.decode("utf-8"),
        }
    )
55
56
# Definitions object collecting the three assets declared in this module,
# listed in upstream-to-downstream order.
defs = Definitions(
    assets=[hackernews_top_story_ids, hackernews_top_stories, hackernews_stories_word_cloud],
)