An End-to-End RAG example using a FAISS retriever with LangChain and OpenAI GPT-3.5 for QA#
This notebook presents a comprehensive end-to-end example utilizing the library’s functionality. Specifically, it showcases how to use a RAG (Retrieval-Augmented Generation) model, powered by GPT-3.5, to retrieve information. This notebook provides insights into leveraging the library for complex use cases.
[ ]:
!pip install "antimatter[langchain]"
!pip install python-dotenv openai
Load the OpenAI API key from a .env file (~/.openai_env)#
[ ]:
# Load the OpenAI key from the ".openai_env" file in the user's home
# directory into the process environment.
import dotenv
import os

# expanduser("~") resolves the home directory even when the HOME variable is
# unset; os.getenv("HOME") would return None there and os.path.join(None, ...)
# raises TypeError.
dotenv.load_dotenv(os.path.join(os.path.expanduser("~"), '.openai_env'))
Register a domain and create a read/write context#
[ ]:
import os
from antimatter import new_domain, Session
from antimatter.builders import *
from antimatter.datatype.datatypes import Datatype
[ ]:
# Either create a new domain or use an existing one.
# Set this to False (and fill in the placeholders in the else-branch) to
# reuse a domain that was registered earlier.
CREATE_NEW_DOMAIN = True

if CREATE_NEW_DOMAIN:
    sess = new_domain("<my_email_address>")
    print(f"domain: {sess.domain_id}")
    # Print a ready-to-paste statement for re-opening this domain later.
    # NOTE: this echoes the API key in clear text; clear the cell output
    # before sharing the notebook.
    print(f"sess = Session.from_api_key(domain_id='{sess.domain_id}', api_key='{sess.api_key}')")
else:
    sess = Session.from_api_key(domain_id='<domain_id>', api_key='<api_key>')

# Path of the capsule file that later cells write to and read from.
file_name = "/tmp/testdata.capsule"
Add some facts to this domain#
Create a fact type called is_project_member with the attributes email and project. Then add two facts of this type:
- is_project_member(email="test@test.com", project="project1")
- is_project_member(email="test2@test2.com", project="project2")
[ ]:
# Declare the "is_project_member" fact type with its two arguments
# (email, project).
sess.add_fact_type(
    "is_project_member",
    description="Team membership",
    arguments={"email": "email of the member", "project": "name of the project"},
)

# Register one membership fact per (email, project) pair; positional
# arguments follow the order in which the fact type declares them.
for member_email, project_name in (
    ("test@test.com", "project1"),
    ("test2@test2.com", "project2"),
):
    sess.add_fact(
        "is_project_member",
        member_email,
        project_name,
    )
[ ]:
# List the facts stored under the "is_project_member" fact type.
sess.list_facts('is_project_member')
Open a dataset#
[ ]:
# Load dataset
# Three inline sample records are turned into a pandas DataFrame.
# NOTE(review): the "email" values and the addresses inside "comments" read
# "[email protected]", which looks like an export/obfuscation placeholder
# rather than a real address -- confirm, and restore test addresses if the
# email-redaction steps later on are expected to match them.
import pandas as pd
data = [
{"id":1,"first_name":"Amanda","last_name":"Jordan","email":"[email protected]","gender":"Female","ip_address":"1.197.201.2","cc":"6759521864920116","country":"Indonesia","birthdate":"3\\/8\\/1971","salary":49756.53,"title":"Internal Auditor","comments":"Hello friends, my name is Alice Johnson and I just turned 29 years old! \\ud83c\\udf89 I am looking forward to connecting with all of you. Feel free to drop me a line at [email protected] or call me at 415-123-4567."},
{"id":2,"first_name":"Albert","last_name":"Freeman","email":"[email protected]","gender":"Male","ip_address":"218.111.175.34","cc":"","country":"Canada","birthdate":"1\\/16\\/1968","salary":150280.17,"title":"Accountant IV","comments":"Customer feedback: I recently visited your store at 5678 Pine Avenue, Dallas, TX 75201. My name is Jane Doe, age 43. I had a wonderful experience and the staff was very friendly. You can reach out to me at [email protected] for any further details."},
{"id":3,"first_name":"Evelyn","last_name":"Morgan","email":"[email protected]","gender":"Female","ip_address":"7.161.136.94","cc":"6767119071901597","country":"Russia","birthdate":"2\\/1\\/1960","salary":144972.51,"title":"Structural Engineer","comments":"Booking Confirmation: Thank you, David Smith (DOB: 01\\/12\\/1978) for booking with us. We have received your payment through the credit card ending with 1234. Your booking ID is #67890. Please save this email for your records. For any queries, contact us at [email protected]."},
]
df = pd.DataFrame(data)
# Preview the first rows of the DataFrame.
df.head()
List and create write context#
[ ]:
# List the write contexts before a new one is added.
sess.list_write_context()
[ ]:
# Register a write context named "write_ctx": summary and description are
# set, and the Hook.Fast hook is added.
sess.add_write_context(
    "write_ctx",
    (
        WriteContextBuilder()
        .set_summary("Sample write context")
        .set_description("Sample description")
        .add_hook(Hook.Fast)
    ),
)
[ ]:
# List the write contexts again after adding "write_ctx".
sess.list_write_context()
Encapsulate data using the write context#
[ ]:
# Encapsulate the DataFrame under the "write_ctx" write context, using
# file_name as the capsule path.
df_capsule = sess.encapsulate(data=df, write_context="write_ctx", path=file_name)
[ ]:
!ls -lrtha /tmp/testdata.capsule
List & Create read contexts#
[ ]:
# List the read contexts before a new one is added.
sess.list_read_context()
[ ]:
# Register a read context named "read_ctx": summary and description are set,
# Hook.Fast is added as a required hook, and a read parameter "key" is
# declared.
sess.add_read_context(
    "read_ctx",
    (
        ReadContextBuilder()
        .set_summary("Sample read context")
        .set_description("Sample description")
        .add_required_hook(Hook.Fast)
        .add_read_parameter("key", True, "description")
    ),
)
[ ]:
# List the read contexts again after adding "read_ctx".
sess.list_read_context()
Open and read data based on read context#
[ ]:
# Load the capsule from file_name under the "read_ctx" read context.
capsule = sess.load_capsule(path=file_name, read_context="read_ctx")
Retrieve the data as a langchain retriever#
[ ]:
# Ask the capsule for its data in the LangchainRetriever datatype.
retriever = capsule.data_as(dt=Datatype.LangchainRetriever)
Retrieve some data from the retriever#
[ ]:
# Query through the retriever's public entry point instead of the private
# _get_relevant_documents hook, which is meant to be invoked by the framework
# with a callback run manager rather than with run_manager=None.
retriever.get_relevant_documents("Amanda Jordan")
Create a gpt-3.5 qa and test with langchain retriever#
[ ]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# Build a conversational QA chain that answers with gpt-3.5-turbo using the
# documents returned by the retriever.
chatbot = ConversationalRetrievalChain.from_llm(ChatOpenAI(model='gpt-3.5-turbo'), retriever=retriever)

# chat_history is a list of previous turns; an empty list (not an empty
# string) denotes a fresh conversation.
resp = chatbot({'question': "Give me details about Amanda Jordan", 'chat_history': []})
print(resp["answer"])
Create a new read context with rules to redact data#
[ ]:
# Create the data policy and remember its identifier for the calls below.
res = sess.create_data_policy("my_data_policy", "sample description")
policy_id = res.policy_id

# Assemble a REDACT rule (priority 10) whose single AnyOf clause matches when
# the "tag.antimatter.io/pii/email_address" tag exists.
email_rule_changes = DataPolicyRuleChangesBuilder()
email_rule_builder = NewDataPolicyRuleBuilder(comment="Deny", effect=RuleEffect.REDACT, priority=10)
email_clause = DataPolicyClauseBuilder(ClauseOperator.AnyOf)
email_tag = TagExpressionBuilder()
email_tag = email_tag.set_name("tag.antimatter.io/pii/email_address")
email_tag = email_tag.set_operator(Operator.Exists)
email_rule = sess.update_data_policy_rules(
    policy_id=policy_id,
    rules=email_rule_changes.add_rule(
        email_rule_builder.add_clause(clause=email_clause.add_tag(email_tag))
    ),
)

# Bind the policy: not attached by default, attached to "read_ctx".
sess.set_data_policy_binding(
    policy_id=policy_id,
    default_attachment=Attachment.NotAttached,
    read_contexts=[("read_ctx", Attachment.Attached)],
)
[ ]:
# Add a second REDACT rule (priority 20) to the same policy; its single AnyOf
# clause matches when the "tag.antimatter.io/pii/credit_card" tag exists.
cc_rule_changes = DataPolicyRuleChangesBuilder()
cc_rule_builder = NewDataPolicyRuleBuilder(comment="Deny", effect=RuleEffect.REDACT, priority=20)
cc_clause = DataPolicyClauseBuilder(ClauseOperator.AnyOf)
cc_tag = TagExpressionBuilder()
cc_tag = cc_tag.set_name("tag.antimatter.io/pii/credit_card")
cc_tag = cc_tag.set_operator(Operator.Exists)
cc_rule = sess.update_data_policy_rules(
    policy_id=policy_id,
    rules=cc_rule_changes.add_rule(
        cc_rule_builder.add_clause(clause=cc_clause.add_tag(cc_tag))
    ),
)
[ ]:
# Describe "read_ctx", the read context the policy was bound to.
sess.describe_read_context("read_ctx")
[ ]:
# Describe the data policy identified by policy_id.
sess.describe_data_policy(policy_id)
Materialize the data with the new rules for redaction#
[ ]:
# Load the capsule again under "read_ctx", now that a data policy is bound
# to that read context.
capsule = sess.load_capsule(path=file_name, read_context="read_ctx")
[ ]:
# Ask the capsule for its data in the PandasDataframe datatype; this rebinds
# df, replacing the DataFrame built from the raw sample records.
df = capsule.data_as(dt=Datatype.PandasDataframe)
[ ]:
# Display the DataFrame read back from the capsule.
df
Use RAG QA with the new redacted context and its materialized retriever#
[ ]:
# Rebuild the retriever from the most recently loaded capsule.
retriever = capsule.data_as(dt=Datatype.LangchainRetriever)
# Query through the public entry point instead of the private
# _get_relevant_documents hook, which is meant to be invoked by the framework
# with a callback run manager rather than with run_manager=None.
retriever.get_relevant_documents("Amanda Jordan")
# Recreate the QA chain so that it uses the new retriever.
chatbot = ConversationalRetrievalChain.from_llm(ChatOpenAI(model='gpt-3.5-turbo'), retriever=retriever)
[ ]:
# Ask the same question again. chat_history is a list of previous turns; an
# empty list (not an empty string) denotes a fresh conversation.
resp = chatbot({'question': "Give me details about Amanda Jordan", 'chat_history': []})
print(resp["answer"])
Remove the email redaction rule from the policy#
[ ]:
# Delete a rule from the policy, passing the first entry of
# email_rule.new_rules.
# NOTE(review): presumably that entry is the ID of the email-address REDACT
# rule created earlier -- confirm against the update_data_policy_rules response.
sess.delete_data_policy_rule(policy_id, email_rule.new_rules[0])
[ ]:
# Describe the policy again after the rule deletion.
sess.describe_data_policy(policy_id)
Read the data with the new redaction rule#
[ ]:
# Load the capsule once more under "read_ctx", after the policy's rules
# were changed.
capsule = sess.load_capsule(path=file_name, read_context="read_ctx")
[ ]:
# Rebuild the retriever from the most recently loaded capsule.
retriever = capsule.data_as(dt=Datatype.LangchainRetriever)
# Query through the public entry point instead of the private
# _get_relevant_documents hook, which is meant to be invoked by the framework
# with a callback run manager rather than with run_manager=None.
retriever.get_relevant_documents("Amanda Jordan")
[ ]:
# Recreate the QA chain on top of the current retriever and ask the same
# question again.
chatbot = ConversationalRetrievalChain.from_llm(ChatOpenAI(model='gpt-3.5-turbo'), retriever=retriever)
# chat_history is a list of previous turns; an empty list (not an empty
# string) denotes a fresh conversation.
resp = chatbot({'question': "Give me details about Amanda Jordan", 'chat_history': []})
print(resp["answer"])