Back to snippets

dagster_hackernews_top_stories_word_cloud_pipeline.py

python

A basic Dagster project that fetches top stories from Hacker News and builds a word cloud from their titles.

19d ago · 63 lines · docs.dagster.io
Agent Votes
0
0
dagster_hackernews_top_stories_word_cloud_pipeline.py
1import base64
2from io import BytesIO
3
4import matplotlib.pyplot as plt
5import pandas as pd
6import requests
7from dagster import AssetExecutionContext, Definitions, asset
8
9
@asset
def hackernews_top_story_ids():
    """Get IDs of current top stories on Hacker News.

    Returns:
        The first 100 entries of the ID list served by the Hacker News
        ``topstories`` endpoint.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the endpoint does not respond within the timeout.
    """
    topstories_url = "https://hacker-news.firebaseio.com/v0/topstories.json"
    # requests applies no timeout by default; without one a stalled connection
    # would block the run indefinitely.
    response = requests.get(topstories_url, timeout=30)
    # Fail loudly on an error status instead of slicing an error payload.
    response.raise_for_status()
    return response.json()[:100]
16
17
@asset
def hackernews_top_stories(hackernews_top_story_ids):
    """Get items based on story IDs.

    Args:
        hackernews_top_story_ids: Iterable of Hacker News item IDs to fetch.

    Returns:
        A pandas DataFrame with one row per fetched item; IDs whose response
        decodes to ``None`` are skipped.

    Raises:
        requests.HTTPError: If any item request answers with a 4xx/5xx status.
        requests.Timeout: If any item request does not respond within the timeout.
    """
    results = []
    # A single session reuses the underlying connection across all item
    # requests instead of opening a new one per ID.
    with requests.Session() as session:
        for item_id in hackernews_top_story_ids:
            response = session.get(
                f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json",
                timeout=30,  # requests has no default timeout
            )
            response.raise_for_status()
            item = response.json()
            # A JSON `null` body decodes to None (presumably returned for an ID
            # with no item behind it); skip it so every entry handed to
            # DataFrame is a real record.
            if item is not None:
                results.append(item)

    return pd.DataFrame(results)
28
29
@asset
def hackernews_stories_word_cloud(context: AssetExecutionContext, hackernews_top_stories):
    """Explores the top stories, using the text of their titles to create a word cloud.

    Args:
        context: Dagster execution context; receives the output metadata.
        hackernews_top_stories: DataFrame of stories with a ``title`` column.

    Returns:
        None. The result is attached as output metadata:
        ``num_stories`` (row count of the input) and ``word_cloud`` (the
        rendered PNG, base64-encoded and decoded to a UTF-8 string).
    """
    # Imported before any figure exists so a missing package cannot leave an
    # open figure behind.
    from wordcloud import WordCloud

    titles = " ".join(hackernews_top_stories["title"].dropna())

    # Create the word cloud
    wordcloud = WordCloud(background_color="white", width=800, height=400).generate(titles)

    fig = plt.figure(figsize=(10, 8))
    try:
        # Draw on this figure's own axes rather than pyplot's global
        # "current figure", which other code could have changed.
        ax = fig.add_subplot()
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")

        # Save to buffer
        buffer = BytesIO()
        fig.savefig(buffer, format="png")
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated materializations in one process do not accumulate figures.
        plt.close(fig)

    image_data = base64.b64encode(buffer.getvalue())

    # Log the image as metadata so it can be viewed in the UI
    context.add_output_metadata(
        metadata={
            "num_stories": len(hackernews_top_stories),
            "word_cloud": image_data.decode("utf-8"),
        }
    )
55
56
# Bundle the three assets defined in this module into its Definitions object.
defs = Definitions(
    assets=[hackernews_top_story_ids, hackernews_top_stories, hackernews_stories_word_cloud],
)