
14_text2sql_agent

October 8, 2024

1 Text2SQL Agent to Interact with CSV Data


1.1 System Architecture
Think of it as an agent equipped with a set of tools: search_cache(), generate_sql_query(), and run_sql_query().
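
As a rough sketch (this wrapper is hypothetical, not a cell from the notebook), the tools compose like this; the real orchestration is implemented later in handle_user_question():

# Sketch only (hypothetical wrapper): mirrors the flow implemented later in
# handle_user_question(): embed the question, check the semantic cache,
# otherwise generate SQL with the LLM and execute it against SQLite.
def answer_question(user_question):
    question_embedding = get_embeddings(user_question)   # embed the question
    hit = search_cache(question_embedding)               # semantic cache lookup
    if hit:
        cached_sql, cached_response = hit
        return cached_response
    sql_query = generate_sql_query(user_question)        # LLM text-to-SQL
    return run_sql_query(db_name, sql_query)             # execute against SQLite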

1.2 Data Ingestion Pipeline


1. Read the CSV
2. Create the database schema
3. Create the table
4. Load the table with the CSV data

[1]: import pandas as pd
import sqlite3

def csv_to_sqlite(csv_file, db_name, table_name):
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file)

    # Connect to the SQLite database (it will create the database file if it doesn't exist)
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    # Infer the schema based on the DataFrame columns and data types
    def create_table_from_df(df, table_name):
        # Get column names and types
        col_types = []
        for col in df.columns:
            dtype = df[col].dtype
            if dtype == 'int64':
                col_type = 'INTEGER'
            elif dtype == 'float64':
                col_type = 'REAL'
            else:
                col_type = 'TEXT'
            col_types.append(f'"{col}" {col_type}')

        # Create the table schema
        col_definitions = ", ".join(col_types)
        create_table_query = f'CREATE TABLE IF NOT EXISTS {table_name} ({col_definitions});'
        # print(create_table_query)

        # Execute the table creation query
        cursor.execute(create_table_query)
        print(f"Table '{table_name}' created with schema: {col_definitions}")

    # Create table schema
    create_table_from_df(df, table_name)

    # Insert CSV data into the SQLite table
    df.to_sql(table_name, conn, if_exists='replace', index=False)

    # Commit and close the connection
    conn.commit()
    conn.close()
    print(f"Data loaded into '{table_name}' table in '{db_name}' SQLite database.")

csv_file = "movies.csv"
db_name = "movies_db.db"
table_name = "movies"
csv_to_sqlite(csv_file, db_name, table_name)

Table 'movies' created with schema: "Movie" TEXT, "LeadStudio" TEXT,
"RottenTomatoes" REAL, "AudienceScore" REAL, "Story" TEXT, "Genre" TEXT,
"TheatersOpenWeek" REAL, "OpeningWeekend" REAL, "BOAvgOpenWeekend" REAL,
"DomesticGross" REAL, "ForeignGross" REAL, "WorldGross" REAL, "Budget" REAL,
"Profitability" REAL, "OpenProfit" REAL, "Year" INTEGER
Data loaded into 'movies' table in 'movies_db.db' SQLite database.

[2]: def run_sql_query(db_name, query):
    """
    Executes a SQL query on a SQLite database and returns the results.

    Args:
        db_name (str): The name of the SQLite database file.
        query (str): The SQL query to run.

    Returns:
        list: Query result as a list of tuples, or an empty list if no results or error occurred.
    """
    try:
        # Connect to the SQLite database
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()

        # Execute the SQL query
        cursor.execute(query)

        # Fetch all results
        results = cursor.fetchall()

        # Close the connection
        conn.close()

        # Return results or an empty list if no results were found
        return results if results else []

    except sqlite3.Error as e:
        print(f"An error occurred while executing the query: {e}")
        return []

[3]: query = f"SELECT count(*) FROM {table_name};"
results = run_sql_query(db_name, query)

if results:
    for row in results:
        print(row)

(970,)

1.3 Ask Natural Language Questions


[24]: import openai
import faiss
import numpy as np
import os
from openai import OpenAI
from litellm import completion
from IPython.display import Markdown, display

[5]: OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# Initialize the FAISS index
dimension = 1536  # Dimension size for OpenAI embeddings (may vary by model)
index = faiss.IndexFlatL2(dimension)  # L2 distance index

# Cache will hold (user_question, sql_query, response)
cache = []

[6]: # Helper function to get embeddings from OpenAI or any embedding model
def get_embeddings(text):
    """
    Converts a text string into a vector embedding using OpenAI embeddings.

    Args:
        text (str): The text string to convert.

    Returns:
        np.array: A vector representation of the text.
    """
    response = client.embeddings.create(input=text, model="text-embedding-3-small")
    embedding = np.array(response.data[0].embedding)
    return embedding
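
A quick sanity check (a sketch, assuming a valid OPENAI_API_KEY is set): the vector returned here must match the dimension the FAISS index above was created with.

# Sanity check (requires a valid OPENAI_API_KEY): text-embedding-3-small
# returns 1536-dimensional vectors, matching the index created above.
emb = get_embeddings("how many movies are in the database?")
print(emb.shape)  # expected: (1536,)
assert emb.shape[0] == dimension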

[31]: def search_cache(question_embedding, threshold=0.1):
    """
    Searches the FAISS index for a similar question.

    Args:
        question_embedding (np.array): The embedding of the user's question.
        threshold (float): The similarity threshold for considering a hit.

    Returns:
        tuple: (sql_query, response) if a hit is found, otherwise None.
    """
    if index.ntotal > 0:
        distances, indices = index.search(np.array([question_embedding]), k=1)
        # print(distances)
        # print(indices)
        # Check if the closest distance is below the threshold
        if distances[0][0] < threshold:
            cache_index = indices[0][0]
            return cache[cache_index][1], cache[cache_index][2]
    return None
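
A note on threshold=0.1: IndexFlatL2 returns squared L2 distances, and OpenAI embeddings are normalized to unit length, so the distance relates to cosine similarity as d² = 2(1 − cos). The illustrative values below (not taken from the notebook run) show that 0.1 corresponds to a cosine similarity of roughly 0.95 or higher.

# Illustration (not from the notebook run): for unit-length embeddings,
# squared L2 distance d2 and cosine similarity cos satisfy d2 = 2 * (1 - cos),
# so threshold=0.1 admits cache hits only at cosine similarity >= ~0.95.
for cos_sim in (0.99, 0.95, 0.90):
    print(f"cos={cos_sim}  ->  squared L2 distance={2 * (1 - cos_sim):.2f}")
# cos=0.99  ->  squared L2 distance=0.02
# cos=0.95  ->  squared L2 distance=0.10
# cos=0.9   ->  squared L2 distance=0.20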

[16]: def get_table_schema(db_name, table_name):
    """
    Retrieves the schema (columns and data types) for a given table in the SQLite database.

    Args:
        db_name (str): The name of the SQLite database file.
        table_name (str): The name of the table.

    Returns:
        list: A list of tuples with column name, data type, and other info.
    """
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    # Use PRAGMA to get the table schema
    cursor.execute(f"PRAGMA table_info({table_name});")
    schema = cursor.fetchall()

    conn.close()
    return schema

table_name = 'movies'
schema = get_table_schema(db_name, table_name)
print(f"Schema for {table_name}:")
for col in schema:
    print(col)

Schema for movies:

(0, 'Movie', 'TEXT', 0, None, 0)
(1, 'LeadStudio', 'TEXT', 0, None, 0)
(2, 'RottenTomatoes', 'REAL', 0, None, 0)
(3, 'AudienceScore', 'REAL', 0, None, 0)
(4, 'Story', 'TEXT', 0, None, 0)
(5, 'Genre', 'TEXT', 0, None, 0)
(6, 'TheatersOpenWeek', 'REAL', 0, None, 0)
(7, 'OpeningWeekend', 'REAL', 0, None, 0)
(8, 'BOAvgOpenWeekend', 'REAL', 0, None, 0)
(9, 'DomesticGross', 'REAL', 0, None, 0)
(10, 'ForeignGross', 'REAL', 0, None, 0)
(11, 'WorldGross', 'REAL', 0, None, 0)
(12, 'Budget', 'REAL', 0, None, 0)
(13, 'Profitability', 'REAL', 0, None, 0)
(14, 'OpenProfit', 'REAL', 0, None, 0)
(15, 'Year', 'INTEGER', 0, None, 0)

[25]: def generate_llm_prompt(table_name, table_schema):
    """
    Generates a prompt to provide context about a table's schema for LLM to convert natural language to SQL.

    Args:
        table_name (str): The name of the table.
        table_schema (list): A list of tuples where each tuple contains information about the columns in the table.

    Returns:
        str: The generated prompt to be used by the LLM.
    """
    prompt = f"""You are an expert in writing SQL queries for relational databases.

You will be provided with a database schema and a natural
language question, and your task is to generate an accurate SQL query.

The database has a table named '{table_name}' with the following schema:\n\n"""

    prompt += "Columns:\n"
    for col in table_schema:
        column_name = col[1]
        column_type = col[2]
        prompt += f"- {column_name} ({column_type})\n"

    prompt += "\nPlease generate a SQL query based on the following natural language question. ONLY return the SQL query."

    return prompt

table_name = "movies"
schema = get_table_schema(db_name, table_name)
# Generate the prompt
llm_prompt = generate_llm_prompt(table_name, schema)
print(llm_prompt)

You are an expert in writing SQL queries for relational databases.

You will be provided with a database schema and a natural
language question, and your task is to generate an accurate SQL query.

The database has a table named 'movies' with the following schema:

Columns:
- Movie (TEXT)
- LeadStudio (TEXT)
- RottenTomatoes (REAL)
- AudienceScore (REAL)
- Story (TEXT)
- Genre (TEXT)
- TheatersOpenWeek (REAL)
- OpeningWeekend (REAL)
- BOAvgOpenWeekend (REAL)
- DomesticGross (REAL)
- ForeignGross (REAL)
- WorldGross (REAL)
- Budget (REAL)
- Profitability (REAL)
- OpenProfit (REAL)
- Year (INTEGER)

Please generate a SQL query based on the following natural language question.
ONLY return the SQL query.

[26]: def handle_user_question(user_question):
    """
    Handles the user's question by first searching the cache, and if there's no hit, generating a SQL query and response.

    Args:
        user_question (str): The user's natural language question.

    Returns:
        list: The response to the user's question.
    """
    # Convert the user's question to an embedding
    question_embedding = get_embeddings(user_question)

    # Step 1: Search cache for similar questions
    cache_hit = search_cache(question_embedding)
    if cache_hit:
        sql_query, response = cache_hit
        print(f"Cache hit! SQL Query: {sql_query}")
        return response

    # Step 2: No hit, go to LLM for SQL generation
    print("Cache miss! Generating SQL from LLM...")
    sql_query = generate_sql_query(user_question)

    # Step 3: Run the SQL query on the database
    response = run_sql_query(db_name, sql_query)

    # Step 4: Store question, SQL, and response in cache
    cache.append((user_question, sql_query, response))
    index.add(np.array([question_embedding]))  # Add question embedding to FAISS index

    return response

[27]: def generate_sql_query(question):
    table_name = 'movies'
    db_name = 'movies_db.db'
    table_schema = get_table_schema(db_name, table_name)
    llm_prompt = generate_llm_prompt(table_name, table_schema)
    user_prompt = """Question: {question}"""
    response = completion(
        api_key=OPENAI_API_KEY,
        model="gpt-4o-mini",
        messages=[
            {"content": llm_prompt.format(table_name=table_name), "role": "system"},
            {"content": user_prompt.format(question=question), "role": "user"}],
        max_tokens=1000
    )
    answer = response.choices[0].message.content
    display(Markdown(answer))
    query = answer.replace("```sql", "").replace("```", "")
    query = query.strip()
    return query
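
The string replace above assumes the model wraps its answer in a ```sql fence. A slightly more defensive extraction (just a sketch, not what this notebook uses) could take the first fenced block when present and fall back to the raw answer otherwise:

# Alternative extraction (sketch, not used above): grab the first fenced code
# block if the model returned one; otherwise fall back to the raw answer.
import re

def extract_sql(answer: str) -> str:
    match = re.search(r"```(?:sql)?\s*(.*?)```", answer, flags=re.DOTALL | re.IGNORECASE)
    return (match.group(1) if match else answer).strip()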

[37]: # question = "total number of movies are made by Warner Bros company in year 2008?"
# question = "how many movies have RottenTomatoes scores lower than 85?"
question = "how many movies with action genre are in the database"
handle_user_question(question)

Cache miss! Generating SQL from LLM...

SELECT COUNT(*) AS ActionMovieCount
FROM movies
WHERE Genre = 'Action';

[37]: [(166,)]

[38]: cache

[38]: [('total number of movies are made by Warner Bros company in year 2008?',
  "SELECT COUNT(*) \nFROM movies \nWHERE LeadStudio = 'Warner Bros' AND Year = 2008;",
  [(21,)]),
 ('how many movies have RottenTomatoes scores greater than 85?',
  'SELECT COUNT(*) \nFROM movies \nWHERE RottenTomatoes > 85;',
  [(120,)]),
 ('how many movies have RottenTomatoes scores lower than 85?',
  'SELECT COUNT(*) \nFROM movies \nWHERE RottenTomatoes < 85;',
  [(782,)]),
 ('how many movies with action genre are in the database',
  "SELECT COUNT(*) AS ActionMovieCount\nFROM movies\nWHERE Genre = 'Action';",
  [(166,)])]

