aws-samples · supinyu · Aug 22, 2024 · Aug 21, 2024 · Aug 21, 2024 · Aug 22, 2024
diff --git a/application/nlq/data_access/database.py b/application/nlq/data_access/database.py
@@ -15,6 +15,7 @@ class RelationDatabase():
  'redshift': 'redshift+psycopg2',
  'starrocks': 'starrocks',
  'clickhouse': 'clickhouse',
+ 'hive': 'hive'
  # Add more mappings here for other databases
  }
 

diff --git a/application/pages/2_🪙_Data_Connection_Management.py b/application/pages/2_🪙_Data_Connection_Management.py
@@ -14,6 +14,7 @@
  'redshift': 'Redshift',
  'starrocks': 'StarRocks',
  'clickhouse': 'Clickhouse',
+ 'hive': 'Hive'
 }
 
 

diff --git a/application/requirements-api.txt b/application/requirements-api.txt
@@ -19,4 +19,8 @@ starrocks==1.0.6
 clickhouse-sqlalchemy==0.2.6
 sagemaker
 python-jose
-sqlalchemy-redshift~=0.8.14
+sqlalchemy-redshift~=0.8.14
+numpy==1.26.4
+pyhive==0.7.0
+thrift==0.20.0
+thrift-sasl==0.4.3
diff --git a/application/requirements.txt b/application/requirements.txt
@@ -18,4 +18,8 @@ starrocks==1.0.6
 clickhouse-sqlalchemy==0.2.6
 sagemaker
 fastapi~=0.110.1
-sqlalchemy-redshift~=0.8.14
+sqlalchemy-redshift~=0.8.14
+numpy==1.26.4
+pyhive==0.7.0
+thrift==0.20.0
+thrift-sasl==0.4.3
diff --git a/application/utils/prompt.py b/application/utils/prompt.py
@@ -67,6 +67,18 @@
 | TRUNC | DATE |
 </data_time_function_list>""".format(top_k=TOP_K)
 
+HIVE_DIALECT_PROMPT_CLAUDE3 = """You are a data analysis expert and proficient in Hive SQL. Given an input question, first create a syntactically correct Hive SQL query to run.
+Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per Hive SQL. 
+Never query for all columns from a table. You must query only the columns that are needed to answer the question. In Hive, column names are typically not wrapped in quotes, so use them as-is.
+Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
+Pay attention to use CURRENT_DATE function to get the current date, if the question involves "today". 
+Note that Hive has some differences from traditional SQL:
+1. Use backticks (`) instead of double quotes for table or column names if they contain spaces or are reserved keywords.
+2. Some functions may have different names or syntax, e.g., use concat() instead of ||.
+3. Hive is case-insensitive for keywords and function names.
+4. Hive uses -- for single-line comments; the /*+ ... */ form is an optimizer hint (e.g., /*+ MAPJOIN(t) */), not a comment.
+Aside from giving the SQL answer, concisely explain yourself after giving the answer in the same language as the question.""".format(top_k=TOP_K)  # Hive dialect prompt; the top_k placeholder is filled from TOP_K when this module is imported
+
 SEARCH_INTENT_PROMPT_CLAUDE3 = """You are an intent classifier and entity extractor, and you need to perform intent classification and entity extraction on search queries.
 Background: I want to query data in the database, and you need to help me determine the user's relevant intent and extract the keywords from the query statement. Finally, return a JSON structure.
 

diff --git a/application/utils/prompts/generate_prompt.py b/application/utils/prompts/generate_prompt.py
@@ -1,5 +1,6 @@
 from utils.prompt import POSTGRES_DIALECT_PROMPT_CLAUDE3, MYSQL_DIALECT_PROMPT_CLAUDE3, \
- DEFAULT_DIALECT_PROMPT, AGENT_COT_EXAMPLE, AWS_REDSHIFT_DIALECT_PROMPT_CLAUDE3, STARROCKS_DIALECT_PROMPT_CLAUDE3, CLICKHOUSE_DIALECT_PROMPT_CLAUDE3
+ DEFAULT_DIALECT_PROMPT, AGENT_COT_EXAMPLE, AWS_REDSHIFT_DIALECT_PROMPT_CLAUDE3, STARROCKS_DIALECT_PROMPT_CLAUDE3, \
+ CLICKHOUSE_DIALECT_PROMPT_CLAUDE3, HIVE_DIALECT_PROMPT_CLAUDE3
 from utils.prompts import guidance_prompt
 from utils.prompts import table_prompt
 import logging
@@ -2206,6 +2207,8 @@ def generate_llm_prompt(ddl, hints, prompt_map, search_box, sql_examples=None, n
  dialect_prompt = STARROCKS_DIALECT_PROMPT_CLAUDE3
  elif dialect == 'clickhouse':
  dialect_prompt = CLICKHOUSE_DIALECT_PROMPT_CLAUDE3
+ elif dialect == 'hive':
+ dialect_prompt = HIVE_DIALECT_PROMPT_CLAUDE3
  else:
  dialect_prompt = DEFAULT_DIALECT_PROMPT
 

diff --git a/application/utils/tool.py b/application/utils/tool.py
@@ -2,7 +2,7 @@
 import logging
 import time
 import random
-from datetime import datetime
+import datetime
 
 import pandas as pd
 
@@ -28,7 +28,7 @@ def generate_log_id():
 
 
 def get_current_time():
- now = datetime.now()
+ now = datetime.datetime.now()
  formatted_time = now.strftime('%Y-%m-%d %H:%M:%S')
  return formatted_time
 
@@ -60,6 +60,9 @@ def convert_timestamps_to_str(data):
  if isinstance(item, pd.Timestamp):
  # Convert Timestamp to string
  new_row.append(item.strftime('%Y-%m-%d %H:%M:%S'))
+ elif isinstance(item, datetime.date):
+ # Convert datetime.date to string
+ new_row.append(item.strftime('%Y-%m-%d %H:%M:%S'))
  else:
  new_row.append(item)
  converted_data.append(new_row)