Data formats supported#
This notebook demonstrates how to write data to, and read it back from, the various data formats supported by the library. These include popular formats such as pandas DataFrames, scalars, lists of dictionaries, single dictionaries, and PyTorch DataLoaders. You will learn how to move between these data formats seamlessly within the library environment.
[ ]:
!pip install "antimatter[all]"
[1]:
import os

from antimatter import Session, new_domain
from antimatter.builders import WriteContextBuilder, ReadContextBuilder, WriteContextHookMode
from antimatter.datatype.datatypes import Datatype
from antimatter.datatype.infer import infer_datatype
Register a domain and create a read/write context#
[2]:
# Either create a new domain or use an existing one.
# Set CREATE_NEW_DOMAIN to False and fill in the placeholders below to reuse a domain.
CREATE_NEW_DOMAIN = True

if CREATE_NEW_DOMAIN:
    sess = new_domain("<my_email_address>")
    print(f"domain: {sess.domain_id}")
    # Emit a ready-to-paste line for re-opening this domain later.
    # NOTE: this line contains the API key; clear the cell output before sharing the notebook.
    print(f"sess = Session.from_api_key(domain_id='{sess.domain_id}', api_key='{sess.api_key}')")
else:
    # Requires `Session` to be imported alongside `new_domain` in the imports cell.
    sess = Session.from_api_key(domain_id='<domain_id>', api_key='<api_key>')

# Path that every conversion cell below writes its capsule to and reads it back from.
file_name = "/tmp/testdata.capsule"
domain: dm-Erj1p7qwigE
Load the data#
DF#
[3]:
# Load dataset: build a small in-memory table of ten sample records and wrap it in a DataFrame.
import pandas as pd
# Each record has the same twelve keys; several records leave string fields empty ("").
# Note: `\\/` and `\\ud83c\\udf89` are escaped backslashes, so the stored strings contain a
# literal backslash sequence (e.g. 3\/8\/1971), not a bare slash or an emoji character.
data = [
{"id":1,"first_name":"Amanda","last_name":"Jordan","email":"[email protected]","gender":"Female","ip_address":"1.197.201.2","cc":"6759521864920116","country":"Indonesia","birthdate":"3\\/8\\/1971","salary":49756.53,"title":"Internal Auditor","comments":"Hello friends, my name is Alice Johnson and I just turned 29 years old! \\ud83c\\udf89 I am looking forward to connecting with all of you. Feel free to drop me a line at [email protected] or call me at 415-123-4567."},
{"id":2,"first_name":"Albert","last_name":"Freeman","email":"[email protected]","gender":"Male","ip_address":"218.111.175.34","cc":"","country":"Canada","birthdate":"1\\/16\\/1968","salary":150280.17,"title":"Accountant IV","comments":"Customer feedback: I recently visited your store at 5678 Pine Avenue, Dallas, TX 75201. My name is Jane Doe, age 43. I had a wonderful experience and the staff was very friendly. You can reach out to me at [email protected] for any further details."},
{"id":3,"first_name":"Evelyn","last_name":"Morgan","email":"[email protected]","gender":"Female","ip_address":"7.161.136.94","cc":"6767119071901597","country":"Russia","birthdate":"2\\/1\\/1960","salary":144972.51,"title":"Structural Engineer","comments":"Booking Confirmation: Thank you, David Smith (DOB: 01\\/12\\/1978) for booking with us. We have received your payment through the credit card ending with 1234. Your booking ID is #67890. Please save this email for your records. For any queries, contact us at [email protected]."},
{"id":4,"first_name":"Denise","last_name":"Riley","email":"[email protected]","gender":"Female","ip_address":"140.35.109.83","cc":"3576031598965625","country":"China","birthdate":"4\\/8\\/1997","salary":90263.05,"title":"Senior Cost Accountant","comments":"Hi, I am Emily Brown, aged 33, and I recently moved to 123 Harmony Lane, Los Angeles, CA 90001. I am looking to make new friends in the neighborhood. Feel free to call me at 323-987-6543 or email me at [email protected]."},
{"id":5,"first_name":"Carlos","last_name":"Burns","email":"[email protected]","gender":"","ip_address":"169.113.235.40","cc":"5602256255204850","country":"South Africa","birthdate":"","salary":123.0,"title":"","comments":"Urgent: My name is Sarah Lee, my SSN is 512-34-6789. I noticed some unauthorized transactions on my credit card number ending in 5678. I am 39 years old, and I urgently need assistance with this. Please contact me at 213-123-9876 or [email protected]."},
{"id":6,"first_name":"Kathryn","last_name":"White","email":"[email protected]","gender":"Female","ip_address":"195.131.81.179","cc":"3583136326049310","country":"Indonesia","birthdate":"2\\/25\\/1983","salary":69227.11,"title":"Account Executive","comments":"Hello, I\'m Mark Thompson. I\\u2019m 36 years old, residing at 3456 Elm Street, Austin, TX 78701. If anyone nearby wants to connect, feel free to email me at [email protected] or call 512-345-6789."},
{"id":7,"first_name":"Samuel","last_name":"Holmes","email":"[email protected]","gender":"Male","ip_address":"232.234.81.197","cc":"3582641366974690","country":"Portugal","birthdate":"12\\/18\\/1987","salary":14247.62,"title":"Senior Financial Analyst","comments":"Hi, my name is Michael Martinez, I am 40 years old, and my SSN is 543-21-6789. Please contact me regarding my account details at 415-234-5678 or [email protected]."},
{"id":8,"first_name":"Harry","last_name":"Howell","email":"[email protected]","gender":"Male","ip_address":"91.235.51.73","cc":"","country":"Bosnia and Herzegovina","birthdate":"3\\/1\\/1962","salary":186469.43,"title":"Web Developer IV","comments":"Customer Feedback: I\'m Linda White, 32 years old. I had a great experience shopping online at your store. Reach me at 456 Elm Street, Phoenix, AZ 85001 or [email protected] for further feedback."},
{"id":9,"first_name":"Jose","last_name":"Foster","email":"[email protected]","gender":"Male","ip_address":"132.31.53.61","cc":"","country":"South Korea","birthdate":"3\\/27\\/1992","salary":231067.84,"title":"Software Test Engineer I","comments":"Hey, it\\u2019s Lisa Davis, I am 28 years old. I noticed a discrepancy in my latest bill. My address is 789 Pine Street, Miami, FL 33101. Please, get in touch at [email protected] or 305-123-4567."},
{"id":10,"first_name":"Emily","last_name":"Stewart","email":"[email protected]","gender":"Female","ip_address":"143.28.251.245","cc":"3574254110301671","country":"Nigeria","birthdate":"1\\/28\\/1997","salary":27234.28,"title":"Health Coach IV","comments":"Support Request: My name is Joseph Johnson. I am facing issues with my recent purchase. Reach me at 123-45-6789 or at [email protected] for order number #56789 details."}
]
# One DataFrame row per record; columns come from the dictionary keys.
df = pd.DataFrame(data)
# Preview the first five rows as the cell output.
df.head()
[3]:
| | id | first_name | last_name | email | gender | ip_address | cc | country | birthdate | salary | title | comments |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Amanda | Jordan | [email protected] | Female | 1.197.201.2 | 6759521864920116 | Indonesia | 3\/8\/1971 | 49756.53 | Internal Auditor | Hello friends, my name is Alice Johnson and I ... |
| 1 | 2 | Albert | Freeman | [email protected] | Male | 218.111.175.34 | | Canada | 1\/16\/1968 | 150280.17 | Accountant IV | Customer feedback: I recently visited your sto... |
| 2 | 3 | Evelyn | Morgan | [email protected] | Female | 7.161.136.94 | 6767119071901597 | Russia | 2\/1\/1960 | 144972.51 | Structural Engineer | Booking Confirmation: Thank you, David Smith (... |
| 3 | 4 | Denise | Riley | [email protected] | Female | 140.35.109.83 | 3576031598965625 | China | 4\/8\/1997 | 90263.05 | Senior Cost Accountant | Hi, I am Emily Brown, aged 33, and I recently ... |
| 4 | 5 | Carlos | Burns | [email protected] | | 169.113.235.40 | 5602256255204850 | South Africa | | 123.00 | | Urgent: My name is Sarah Lee, my SSN is 512-34... |
Scalar data#
[4]:
# Take a single string value (the first row's comment) to use as the scalar sample.
scalar_data = df["comments"].iat[0]
scalar_data
[4]:
'Hello friends, my name is Alice Johnson and I just turned 29 years old! \\ud83c\\udf89 I am looking forward to connecting with all of you. Feel free to drop me a line at [email protected] or call me at 415-123-4567.'
List of Dicts#
[5]:
# One dictionary per DataFrame row, keyed by column name.
list_dict_data = df.to_dict(orient="records")
# Show only the first two records to keep the output short.
list_dict_data[0:2]
[5]:
[{'id': 1,
'first_name': 'Amanda',
'last_name': 'Jordan',
'email': '[email protected]',
'gender': 'Female',
'ip_address': '1.197.201.2',
'cc': '6759521864920116',
'country': 'Indonesia',
'birthdate': '3\\/8\\/1971',
'salary': 49756.53,
'title': 'Internal Auditor',
'comments': 'Hello friends, my name is Alice Johnson and I just turned 29 years old! \\ud83c\\udf89 I am looking forward to connecting with all of you. Feel free to drop me a line at [email protected] or call me at 415-123-4567.'},
{'id': 2,
'first_name': 'Albert',
'last_name': 'Freeman',
'email': '[email protected]',
'gender': 'Male',
'ip_address': '218.111.175.34',
'cc': '',
'country': 'Canada',
'birthdate': '1\\/16\\/1968',
'salary': 150280.17,
'title': 'Accountant IV',
'comments': 'Customer feedback: I recently visited your store at 5678 Pine Avenue, Dallas, TX 75201. My name is Jane Doe, age 43. I had a wonderful experience and the staff was very friendly. You can reach out to me at [email protected] for any further details.'}]
Dict#
[6]:
# Use the first record on its own as the single-dictionary sample.
dict_data = next(iter(list_dict_data))
dict_data
[6]:
{'id': 1,
'first_name': 'Amanda',
'last_name': 'Jordan',
'email': '[email protected]',
'gender': 'Female',
'ip_address': '1.197.201.2',
'cc': '6759521864920116',
'country': 'Indonesia',
'birthdate': '3\\/8\\/1971',
'salary': 49756.53,
'title': 'Internal Auditor',
'comments': 'Hello friends, my name is Alice Johnson and I just turned 29 years old! \\ud83c\\udf89 I am looking forward to connecting with all of you. Feel free to drop me a line at [email protected] or call me at 415-123-4567.'}
Pytorch data loader#
[7]:
from torch.utils.data import DataLoader

# Wrap the list of record dictionaries in a DataLoader using its default settings.
dl = DataLoader(dataset=list_dict_data)
dl
[7]:
<torch.utils.data.dataloader.DataLoader at 0x290650850>
Convert all the formats#
The cells below encapsulate each of the data formats above in turn, then read the resulting capsule back as every format in the list.
[8]:
# Every datatype exercised in this notebook; `frm` and `to` are two names for the same list.
to = [
    Datatype.PandasDataframe,
    Datatype.Scalar,
    Datatype.DictList,
    Datatype.PytorchDataLoader,
    Datatype.Dict,
]
frm = to
Convert df to all the other formats#
[9]:
# Write the DataFrame into a capsule at `file_name`, then load that capsule back.
sess.encapsulate(path=file_name, data=df, write_context="default")
capsule = sess.load_capsule(read_context="default", path=file_name)
read_data = capsule.data()

# Reading without a target format should give back the same type that was written.
source_type = type(df)
assert source_type == type(read_data)

# Request each supported format and check that the result is inferred as that format.
for target in to:
    print(f"Converting from {source_type} to {target}")
    converted = capsule.data_as(target)
    assert infer_datatype(converted) == target
Converting from <class 'pandas.core.frame.DataFrame'> to Datatype.PandasDataframe
Converting from <class 'pandas.core.frame.DataFrame'> to Datatype.Scalar
Converting from <class 'pandas.core.frame.DataFrame'> to Datatype.DictList
Converting from <class 'pandas.core.frame.DataFrame'> to Datatype.PytorchDataLoader
Converting from <class 'pandas.core.frame.DataFrame'> to Datatype.Dict
Convert list of dicts to all the other formats#
[10]:
# Write the list of dictionaries into a capsule at `file_name`, then load that capsule back.
sess.encapsulate(path=file_name, data=list_dict_data, write_context="default")
capsule = sess.load_capsule(read_context="default", path=file_name)
read_data = capsule.data()

# Reading without a target format should give back the same type that was written.
source_type = type(list_dict_data)
assert source_type == type(read_data)

# Request each supported format and check that the result is inferred as that format.
for target in to:
    print(f"Converting from {source_type} to {target}")
    converted = capsule.data_as(target)
    assert infer_datatype(converted) == target
Converting from <class 'list'> to Datatype.PandasDataframe
Converting from <class 'list'> to Datatype.Scalar
Converting from <class 'list'> to Datatype.DictList
Converting from <class 'list'> to Datatype.PytorchDataLoader
Converting from <class 'list'> to Datatype.Dict
Convert scalar to all the other formats#
[11]:
# Write the scalar string into a capsule at `file_name`, then load that capsule back.
sess.encapsulate(path=file_name, data=scalar_data, write_context="default")
capsule = sess.load_capsule(read_context="default", path=file_name)
read_data = capsule.data()

# Reading without a target format should give back the same type that was written.
source_type = type(scalar_data)
assert source_type == type(read_data)

# Request each supported format and check that the result is inferred as that format.
for target in to:
    print(f"Converting from {source_type} to {target}")
    converted = capsule.data_as(target)
    assert infer_datatype(converted) == target
Converting from <class 'str'> to Datatype.PandasDataframe
Converting from <class 'str'> to Datatype.Scalar
Converting from <class 'str'> to Datatype.DictList
Converting from <class 'str'> to Datatype.PytorchDataLoader
Converting from <class 'str'> to Datatype.Dict
Convert dictionary to all the other formats#
[12]:
# Write the single dictionary into a capsule at `file_name`, then load that capsule back.
sess.encapsulate(path=file_name, data=dict_data, write_context="default")
capsule = sess.load_capsule(read_context="default", path=file_name)
read_data = capsule.data()

# Reading without a target format should give back the same type that was written.
source_type = type(dict_data)
assert source_type == type(read_data)

# Request each supported format and check that the result is inferred as that format.
for target in to:
    print(f"Converting from {source_type} to {target}")
    converted = capsule.data_as(target)
    assert infer_datatype(converted) == target
Converting from <class 'dict'> to Datatype.PandasDataframe
Converting from <class 'dict'> to Datatype.Scalar
Converting from <class 'dict'> to Datatype.DictList
Converting from <class 'dict'> to Datatype.PytorchDataLoader
Converting from <class 'dict'> to Datatype.Dict
Convert pytorch dataloader to all the other formats#
[13]:
# Write the DataLoader into a capsule at `file_name`, then load that capsule back.
sess.encapsulate(path=file_name, data=dl, write_context="default")
capsule = sess.load_capsule(read_context="default", path=file_name)
read_data = capsule.data()

# Reading without a target format should give back the same type that was written.
source_type = type(dl)
assert source_type == type(read_data)

# Request each supported format and check that the result is inferred as that format.
for target in to:
    print(f"Converting from {source_type} to {target}")
    converted = capsule.data_as(target)
    assert infer_datatype(converted) == target
Converting from <class 'torch.utils.data.dataloader.DataLoader'> to Datatype.PandasDataframe
Converting from <class 'torch.utils.data.dataloader.DataLoader'> to Datatype.Scalar
Converting from <class 'torch.utils.data.dataloader.DataLoader'> to Datatype.DictList
Converting from <class 'torch.utils.data.dataloader.DataLoader'> to Datatype.PytorchDataLoader
Converting from <class 'torch.utils.data.dataloader.DataLoader'> to Datatype.Dict