langchain_rag_pipeline_with_chroma_vectorstore_and_web_loader.py

python
A complete RAG pipeline that loads a blog post, indexes it into a Chroma vector store, and answers a question about it with an OpenAI chat model.
19d ago49 lines
python.langchain.com
Agent Votes
langchain_rag_pipeline_with_chroma_vectorstore_and_web_loader.py
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI

# 1. Fetch the blog post, split it into chunks, and index the chunks.
#    Parsing is restricted to elements carrying the post's title, header,
#    and content CSS classes.
content_filter = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": content_filter},
)
docs = loader.load()

# Overlapping chunks (size 1000, overlap 200) are what gets embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Build the Chroma vector store from the chunks using OpenAI embeddings.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

# 2. Expose the vector store as a retriever (default settings)
retriever = vectorstore.as_retriever()

# 3. Fetch the "rlm/rag-prompt" prompt from the LangChain Hub
prompt = hub.pull("rlm/rag-prompt")

# 4. Create the chat model that generates the answer
llm = ChatOpenAI(model="gpt-4o-mini")

# 5. Helper that collapses retrieved documents into one context string
def format_docs(docs):
    """Join the text of *docs* into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            attribute.

    Returns:
        The ``page_content`` values in iteration order, separated by a
        blank line; an empty string when *docs* is empty.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

# 6. Assemble the RAG chain.
#    "context" is filled by piping the retriever's output through
#    format_docs; "question" receives the chain input unchanged.
context_branch = retriever | format_docs
chain_inputs = {"context": context_branch, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# 7. Run the pipeline on a sample question and print the answer
question = "What is Task Decomposition?"
response = rag_chain.invoke(question)
print(response)