Turn_CSV_data_into_Text2SQL_agent
Turn_CSV_data_into_Text2SQL_agent
October 8, 2024
1
2. Create Database schema
3. Create a table
4. Load table with CSV data
# Connect to the SQLite database (it will create the database file if it
# doesn't exist)
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
# Infer the schema based on the DataFrame columns and data types
def create_table_from_df(df, table_name):
# Get column names and types
col_types = []
for col in df.columns:
dtype = df[col].dtype
if dtype == 'int64':
col_type = 'INTEGER'
elif dtype == 'float64':
col_type = 'REAL'
else:
col_type = 'TEXT'
col_types.append(f'"{col}" {col_type}')
# print(create_table_query)
2
conn.commit()
conn.close()
print(f"Data loaded into '{table_name}' table in '{db_name}' SQLite␣
↪database.")
# Input CSV path, SQLite database file name, and destination table name.
# NOTE: db_name and table_name stay defined at module level and are read
# again by later cells.
csv_file = "movies.csv"
db_name = "movies_db.db"
table_name = "movies"
# Run the CSV -> SQLite load (csv_to_sqlite is defined earlier in the notebook).
csv_to_sqlite(csv_file, db_name, table_name)
Args:
db_name (str): The name of the SQLite database file.
query (str): The SQL query to run.
Returns:
list: Query result as a list of tuples, or an empty list if there are no
results or an error occurred.
"""
try:
# Connect to the SQLite database
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
except sqlite3.Error as e:
3
print(f"An error occurred while executing the query: {e}")
return []
if results:
for row in results:
print(row)
(970,)
[6]: # Helper function to get embeddings from OpenAI or any embedding model
def get_embeddings(text):
    """
    Embed a text string with the OpenAI "text-embedding-3-small" model.

    Args:
        text (str): The text to embed.

    Returns:
        np.array: The embedding of the first item in the API response.
    """
    # `client` is not created here; it must already exist at module level.
    api_result = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = api_result.data[0]
    return np.array(first_item.embedding)
4
def search_cache(question_embedding, threshold=0.1):
    """
    Look up the nearest previously cached question in the FAISS index.

    Args:
        question_embedding (np.array): The embedding of the user's question.
        threshold (float): Upper bound (exclusive) on the distance to the
            nearest indexed question for it to count as a cache hit. Note
            that this is a distance, so smaller means more similar.

    Returns:
        tuple or None: (sql_query, response) taken from the matching `cache`
        entry, or None when the index is empty or the nearest neighbour is
        not closer than `threshold`.
    """
    # Nothing has been indexed yet, so there is nothing to match against.
    if index.ntotal <= 0:
        return None

    # FAISS operates on float32 matrices with one row per query vector;
    # embeddings built via np.array(list_of_floats) are float64, so cast
    # explicitly instead of relying on the binding to convert.
    query = np.array([question_embedding], dtype=np.float32)
    distances, indices = index.search(query, k=1)

    # Only the single nearest neighbour (k=1) is considered.
    if distances[0][0] < threshold:
        # Cache entries are (question, sql_query, response) tuples.
        hit = cache[indices[0][0]]
        return hit[1], hit[2]
    return None
Args:
db_name (str): The name of the SQLite database file.
table_name (str): The name of the table.
Returns:
list: A list of tuples with column name, data type, and other info.
"""
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
conn.close()
return schema
# Show the schema of the movies table, one column description per line.
table_name = 'movies'
schema = get_table_schema(db_name, table_name)
print(f"Schema for {table_name}:")
for column_info in schema:
    print(column_info)
Args:
table_name (str): The name of the table.
table_schema (list): A list of tuples where each tuple contains
information about the columns in the table.
Returns:
str: The generated prompt to be used by the LLM.
"""
prompt = f"""You are an expert in writing SQL queries for relational␣
↪databases.
The database has a table named '{table_name}' with the following schema:
↪\n\n"""
prompt += "Columns:\n"
6
prompt += "\nPlease generate a SQL query based on the following natural␣
↪language question. ONLY return the SQL query."
return prompt
table_name = "movies"
# Fetch the table's schema; db_name must already be defined at module level.
schema = get_table_schema(db_name, table_name)
# Generate the prompt
llm_prompt = generate_llm_prompt(table_name, schema)
# Print the prompt so it can be inspected before it is sent to the LLM.
print(llm_prompt)
The database has a table named 'movies' with the following schema:
Columns:
- Movie (TEXT)
- LeadStudio (TEXT)
- RottenTomatoes (REAL)
- AudienceScore (REAL)
- Story (TEXT)
- Genre (TEXT)
- TheatersOpenWeek (REAL)
- OpeningWeekend (REAL)
- BOAvgOpenWeekend (REAL)
- DomesticGross (REAL)
- ForeignGross (REAL)
- WorldGross (REAL)
- Budget (REAL)
- Profitability (REAL)
- OpenProfit (REAL)
- Year (INTEGER)
Please generate a SQL query based on the following natural language question.
ONLY return the SQL query.
Args:
user_question (str): The user's natural language question.
7
Returns:
list: The response to the user's question.
"""
# Convert the user's question to an embedding
question_embedding = get_embeddings(user_question)
return response
8
[37]: # question = "total number of movies are made by Warner Bros company in year 2008?"
# question = "how many movies have RottenTomatoes scores lower than 85?"
# Ask one natural-language question; as the last expression of the cell, the
# value returned by handle_user_question is shown as the cell output.
question = "how many movies with action genre are in the database"
handle_user_question(question)
[37]: [(166,)]
[38]: cache
[38]: [('total number of movies are made by Warner Bros company in year 2008?',
"SELECT COUNT(*) \nFROM movies \nWHERE LeadStudio = 'Warner Bros' AND Year =
2008;",
[(21,)]),
('how many movies have RottenTomatoes scores greater than 85?',
'SELECT COUNT(*) \nFROM movies \nWHERE RottenTomatoes > 85;',
[(120,)]),
('how many movies have RottenTomatoes scores lower than 85?',
'SELECT COUNT(*) \nFROM movies \nWHERE RottenTomatoes < 85;',
[(782,)]),
('how many movies with action genre are in the database',
"SELECT COUNT(*) AS ActionMovieCount\nFROM movies\nWHERE Genre = 'Action';",
[(166,)])]
[ ]:
[ ]: