An End-to-End RAG Example Using a FAISS Retriever with LangChain and OpenAI GPT-3.5 for QA#

This notebook presents a comprehensive end-to-end example of the antimatter library’s functionality. Specifically, it shows how to build a RAG (Retrieval-Augmented Generation) question-answering pipeline, powered by GPT-3.5, on top of data protected by the library, and how redaction rules change what the model can retrieve. It is intended as a guide to using the library in more complex use cases.

[ ]:
# Install the antimatter client with its "langchain" extra, plus the helper
# packages used below (python-dotenv for loading the key file, openai for the LLM).
!pip install "antimatter[langchain]"
!pip install python-dotenv openai

Load the OpenAI API key from a .env file#

[ ]:
import os

import dotenv

# Load environment variables from ~/.openai_env (expected to hold the OpenAI
# API key). expanduser("~") is used instead of os.getenv("HOME") because HOME
# may be unset (e.g. on Windows), in which case os.path.join(None, ...) would
# raise a TypeError before the file is even looked up.
dotenv.load_dotenv(os.path.join(os.path.expanduser("~"), ".openai_env"))

Register a domain and create a read/write context#

[ ]:
import os
from antimatter import new_domain, Session
from antimatter.builders import *
from antimatter.datatype.datatypes import Datatype
[ ]:
# Either create a new domain or use an existing one.
# Set this flag to False (and fill in the placeholders in the else branch)
# to reconnect to a domain that already exists.
CREATE_NEW_DOMAIN = True

if CREATE_NEW_DOMAIN:
    sess = new_domain("<my_email_address>")
    print(f"domain: {sess.domain_id}")
    # Print a ready-to-paste snippet for reconnecting to this domain later.
    # NOTE: this echoes the API key into the cell output; clear the output
    # before sharing the notebook.
    print(f"sess = Session.from_api_key(domain_id='{sess.domain_id}', api_key='{sess.api_key}')")
else:
    sess = Session.from_api_key(domain_id='<domain_id>', api_key='<api_key>')

# Path the capsule is written to by encapsulate() and read back from by load_capsule().
file_name = "/tmp/testdata.capsule"

Add some facts to this domain#

Create a fact type called is_project_member with the attributes email and project, then add two facts of this type:

- is_project_member(email="test@test.com", project="project1")
- is_project_member(email="test2@test2.com", project="project2")

[ ]:
# Declare the fact type: a two-argument relation between a member's email
# and the project they belong to.
sess.add_fact_type(
    "is_project_member",
    description="Team membership",
    arguments={"email": "email of the member", "project": "name of the project"},
)

# Add the two facts described in the text above. The email literals had been
# replaced by an e-mail obfuscation placeholder; the intended values are restored.
sess.add_fact(
    "is_project_member",
    "test@test.com",
    "project1",
)

sess.add_fact(
    "is_project_member",
    "test2@test2.com",
    "project2",
)
[ ]:
# Show the facts currently stored for the is_project_member fact type.
sess.list_facts('is_project_member')

Open a dataset#

[ ]:
# Load dataset
import pandas as pd

# Three sample records containing PII-like fields (names, IPs, card numbers,
# free-text comments). The values were pasted from JSON, so JSON-only escapes
# have been decoded here: "\/" is a plain "/", and the surrogate pair
# \ud83c\udf89 is the single character U+1F389. Left double-escaped, they
# would put literal backslashes into the dates and comments.
# NOTE(review): the "[email protected]" values look like placeholders left by
# an e-mail obfuscation filter; the original addresses are not recoverable
# from this file -- substitute real sample addresses before relying on
# e-mail detection/redaction.
data = [
    {"id":1,"first_name":"Amanda","last_name":"Jordan","email":"[email protected]","gender":"Female","ip_address":"1.197.201.2","cc":"6759521864920116","country":"Indonesia","birthdate":"3/8/1971","salary":49756.53,"title":"Internal Auditor","comments":"Hello friends, my name is Alice Johnson and I just turned 29 years old! \U0001F389 I am looking forward to connecting with all of you. Feel free to drop me a line at [email protected] or call me at 415-123-4567."},
    {"id":2,"first_name":"Albert","last_name":"Freeman","email":"[email protected]","gender":"Male","ip_address":"218.111.175.34","cc":"","country":"Canada","birthdate":"1/16/1968","salary":150280.17,"title":"Accountant IV","comments":"Customer feedback: I recently visited your store at 5678 Pine Avenue, Dallas, TX 75201. My name is Jane Doe, age 43. I had a wonderful experience and the staff was very friendly. You can reach out to me at [email protected] for any further details."},
    {"id":3,"first_name":"Evelyn","last_name":"Morgan","email":"[email protected]","gender":"Female","ip_address":"7.161.136.94","cc":"6767119071901597","country":"Russia","birthdate":"2/1/1960","salary":144972.51,"title":"Structural Engineer","comments":"Booking Confirmation: Thank you, David Smith (DOB: 01/12/1978) for booking with us. We have received your payment through the credit card ending with 1234. Your booking ID is #67890. Please save this email for your records. For any queries, contact us at [email protected]."},
]

df = pd.DataFrame(data)
df.head()

List and create write context#

[ ]:
# List the write contexts defined so far (before the new one is added).
sess.list_write_context()
[ ]:
# Create a new write context named "write_ctx".
# The builder is assembled first, then registered with the session.
write_ctx_builder = (
    WriteContextBuilder()
    .set_summary("Sample write context")
    .set_description("Sample description")
    .add_hook(Hook.Fast)
)
sess.add_write_context("write_ctx", write_ctx_builder)
[ ]:
# List the write contexts again; "write_ctx" should now be included.
sess.list_write_context()

Encapsulate data using the write context#

[ ]:
# Encapsulate the DataFrame under "write_ctx"; the result is written to file_name.
df_capsule = sess.encapsulate(data=df, write_context="write_ctx", path=file_name)
[ ]:
# Confirm the capsule file exists on disk (this path must match file_name).
!ls -lrtha /tmp/testdata.capsule

List & Create read contexts#

[ ]:
# List the read contexts defined so far (before the new one is added).
sess.list_read_context()
[ ]:
# Build the read context definition first, then register it as "read_ctx".
read_ctx_builder = (
    ReadContextBuilder()
    .set_summary("Sample read context")
    .set_description("Sample description")
    .add_required_hook(Hook.Fast)
    .add_read_parameter("key", True, "description")
)
sess.add_read_context("read_ctx", read_ctx_builder)
[ ]:
# List the read contexts again; "read_ctx" should now be included.
sess.list_read_context()

Open and read data based on read context#

[ ]:
# Open the capsule file written earlier, reading it under "read_ctx".
capsule = sess.load_capsule(path=file_name, read_context="read_ctx")

Retrieve the data as a langchain retriever#

[ ]:
# Materialize the capsule's contents as a LangChain retriever.
retriever = capsule.data_as(dt=Datatype.LangchainRetriever)

Retrieve some data from the retriever#

[ ]:
# Query through the retriever's public API. The underscore-prefixed
# `_get_relevant_documents` is the internal hook that subclasses implement
# and expects a callback run manager, so it should not be called directly.
retriever.get_relevant_documents("Amanda Jordan")

Create a gpt-3.5 qa and test with langchain retriever#

[ ]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# Build a conversational QA chain that answers with GPT-3.5 using documents
# fetched from the capsule-backed retriever.
chatbot = ConversationalRetrievalChain.from_llm(ChatOpenAI(model='gpt-3.5-turbo'), retriever=retriever)

# chat_history is a sequence of prior turns; an empty list (rather than an
# empty string) is the correct way to say "no history yet".
resp = chatbot({'question': "Give me details about Amanda Jordan", 'chat_history': []})
print(resp["answer"])

Create a data policy with redaction rules and bind it to the read context#

[ ]:
# Create an (initially empty) data policy and keep its ID for the calls below.
res = sess.create_data_policy("my_data_policy", "sample description")
policy_id = res.policy_id

# Add a rule with effect REDACT (priority 10) whose clause matches data
# carrying the pii/email_address tag. The return value is kept so the rule
# can be referenced later via email_rule.new_rules.
email_rule = sess.update_data_policy_rules(
    policy_id=policy_id,
    rules=DataPolicyRuleChangesBuilder().add_rule(
        NewDataPolicyRuleBuilder(comment="Deny", effect=RuleEffect.REDACT, priority=10).add_clause(
            clause=DataPolicyClauseBuilder(ClauseOperator.AnyOf).add_tag(
                TagExpressionBuilder()
                .set_name("tag.antimatter.io/pii/email_address")
                .set_operator(Operator.Exists)
            )
        )
    ),
)

# Bind the policy: not attached by default, attached explicitly to "read_ctx".
sess.set_data_policy_binding(
    policy_id=policy_id,
    default_attachment=Attachment.NotAttached,
    read_contexts=[("read_ctx", Attachment.Attached)],
)
[ ]:
# Add a second REDACT rule (priority 20) to the same policy, this one
# matching data that carries the pii/credit_card tag.
cc_rule = sess.update_data_policy_rules(
    policy_id=policy_id,
    rules=DataPolicyRuleChangesBuilder().add_rule(
        NewDataPolicyRuleBuilder(comment="Deny", effect=RuleEffect.REDACT, priority=20).add_clause(
            clause=DataPolicyClauseBuilder(ClauseOperator.AnyOf).add_tag(
                TagExpressionBuilder()
                .set_name("tag.antimatter.io/pii/credit_card")
                .set_operator(Operator.Exists)
            )
        )
    ),
)
[ ]:
# Inspect "read_ctx" after the policy binding above.
sess.describe_read_context("read_ctx")
[ ]:
# Inspect the data policy; both redaction rules should be listed.
sess.describe_data_policy(policy_id)

Materialize the data with the new rules for redaction#

[ ]:
# Re-open the capsule under "read_ctx", which now has the redaction policy attached.
capsule = sess.load_capsule(path=file_name, read_context="read_ctx")
[ ]:
# Materialize the capsule as a pandas DataFrame (rebinds df, replacing the original input frame).
df = capsule.data_as(dt=Datatype.PandasDataframe)
[ ]:
# Display the frame; values matched by the redaction rules are expected to appear redacted.
df

Use RAG QA with the new redacted context and its materialized retriever#

[ ]:
# Rebuild the retriever from the capsule that was re-opened with redaction in effect.
retriever = capsule.data_as(dt=Datatype.LangchainRetriever)
# Use the public retriever API instead of the private `_get_relevant_documents` hook.
retriever.get_relevant_documents("Amanda Jordan")
# Recreate the QA chain so it uses the new retriever.
chatbot = ConversationalRetrievalChain.from_llm(ChatOpenAI(model='gpt-3.5-turbo'), retriever=retriever)
[ ]:
# Ask the same question as before. chat_history is a sequence of prior turns,
# so "no history" is an empty list rather than an empty string.
resp = chatbot({'question': "Give me details about Amanda Jordan", 'chat_history': []})
print(resp["answer"])

Remove the email redaction rule from the policy#

[ ]:
# Delete the e-mail redaction rule; email_rule.new_rules[0] is presumably the
# identifier of the rule created by the earlier update call -- confirm against the API.
sess.delete_data_policy_rule(policy_id, email_rule.new_rules[0])
[ ]:
# Inspect the policy again; only the credit-card rule should remain.
sess.describe_data_policy(policy_id)

Read the data with the new redaction rule#

[ ]:
# Re-open the capsule under "read_ctx" now that the e-mail rule has been removed.
capsule = sess.load_capsule(path=file_name, read_context="read_ctx")
[ ]:
# Rebuild the retriever from the freshly re-opened capsule.
retriever = capsule.data_as(dt=Datatype.LangchainRetriever)
# Use the public retriever API instead of the private `_get_relevant_documents` hook.
retriever.get_relevant_documents("Amanda Jordan")
[ ]:
# Recreate the QA chain on the current retriever and ask the same question.
chatbot = ConversationalRetrievalChain.from_llm(ChatOpenAI(model='gpt-3.5-turbo'), retriever=retriever)
# chat_history is a sequence of prior turns; use an empty list for "no history".
resp = chatbot({'question': "Give me details about Amanda Jordan", 'chat_history': []})
print(resp["answer"])