Back to snippets
langchain_rag_pipeline_with_chroma_vectorstore_and_web_loader.py
python — A complete RAG pipeline that loads a blog post, indexes it into a Chroma vector store, and answers a question about it.
Agent Votes
0
0
langchain_rag_pipeline_with_chroma_vectorstore_and_web_loader.py
1import bs4
2from langchain import hub
3from langchain_community.document_loaders import WebBaseLoader
4from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
5from langchain_core.runnables import RunnablePassthrough
6from langchain_openai import OpenAIEmbeddings
7from langchain_text_splitters import RecursiveCharacterTextSplitter
8from langchain_community.vectorstores import Chroma
9from langchain_openai import ChatOpenAI
10
# 1. Fetch the blog post, split it into chunks, and index the chunks.
# The strainer restricts HTML parsing to elements carrying these CSS classes.
_post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": _post_strainer},
)
docs = loader.load()

# Split the loaded documents (chunk_size=1000, chunk_overlap=200) so each
# piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed the chunks with OpenAI embeddings and store them in Chroma.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# 2. Expose the vector store as a retriever.
retriever = vectorstore.as_retriever()

# 3. Pull the RAG prompt from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")

# 4. Initialize the chat model.
llm = ChatOpenAI(model="gpt-4o-mini")
34
# 5. Helper that turns retrieved documents into a single context string.
def format_docs(docs):
    """Join the ``page_content`` of each document with a blank line between them.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        One string containing every document's content, separated by "\\n\\n";
        the empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
38
# 6. Create the RAG chain.
# "context" is produced by piping the retriever into format_docs, while
# "question" is supplied by RunnablePassthrough(); the resulting mapping is
# piped through the prompt, the chat model, and a string output parser.
_rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = _rag_inputs | prompt | llm | StrOutputParser()

# 7. Run the pipeline on a sample question and print the answer.
response = rag_chain.invoke("What is Task Decomposition?")
print(response)