Building a Data Analysis AI Agent: From Basics to Real‑World Implementation

This article walks through the design and implementation of a data‑analysis AI agent that converts natural‑language queries into SQL, executes them on a SQLite sales database, generates visualizations, and produces insight reports, complete with architecture diagrams and full Python code examples.

Coder Trainee
Coder Trainee
Coder Trainee
Building a Data Analysis AI Agent: From Basics to Real‑World Implementation

1. Feature Overview

Natural Language to SQL (Text‑to‑SQL) : Convert Chinese questions such as "查询上个月的销售额" into executable SQL statements.

Data Query & Statistical Analysis : Retrieve and aggregate data from the database.

Automatic Chart Generation : Produce Python code for matplotlib/seaborn charts (e.g., "画一个销售趋势图").

Data Insight & Report Generation : Summarize findings and generate analysis reports (e.g., "生成月度分析报告").

2. Core Architecture

┌─────────────────────────────────────────────────────────────────┐
│               数据分析 Agent 架构                              │
├─────────────────────────────────────────────────────────────────┤
│                                                             │
│   用户问题 ──► 理解意图 ──► 生成 SQL ──► 执行查询 ──► 分析结果 │
│            │          │          │                     │
│            ▼          ▼          ▼                     │
│         ┌─────────┐ ┌─────────┐ ┌─────────┐            │
│         │ Schema  │ │ 数据库 │ │ 可视化 │            │
│         │ 理解   │ │ 查询   │ │ 生成   │            │
│         └─────────┘ └─────────┘ └─────────┘            │
│                                                             │
└─────────────────────────────────────────────────────────────────┘

3. Code Implementation

3.1 Schema Understanding

# tools/schema_tools.py
from langchain.tools import tool
import sqlite3

@tool
def get_table_schema(table_name: str) -> str:
    """获取数据库表结构"""
    conn = sqlite3.connect("sales.db")
    cursor = conn.cursor()
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = cursor.fetchall()
    conn.close()
    schema = f"表 {table_name} 的结构:
"
    for col in columns:
        schema += f"- {col[1]} ({col[2]})
"
    return schema

@tool
def list_tables() -> str:
    """列出所有数据表"""
    conn = sqlite3.connect("sales.db")
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = cursor.fetchall()
    conn.close()
    return "数据库中的表:
" + "
".join([t[0] for t in tables])

3.2 Text‑to‑SQL

# tools/sql_tools.py
from langchain.tools import tool
from langchain_openai import ChatOpenAI
import sqlite3, pandas as pd

llm = ChatOpenAI(model="gpt-4", temperature=0)

@tool
def text_to_sql(question: str, schema: str) -> str:
    """将自然语言问题转换为 SQL"""
    prompt = f"""
根据以下数据库表结构,将用户问题转换为 SQL 查询。

表结构:
{schema}

用户问题:
{question}

要求:
1. 只输出 SQL 语句,不要有其他内容
2. 使用标准 SQL 语法
3. 考虑时间范围(如\"上个月\"需要计算)
"""
    response = llm.invoke(prompt)
    return response.content

@tool
def execute_sql(sql: str) -> str:
    """执行 SQL 查询并返回结果"""
    conn = sqlite3.connect("sales.db")
    try:
        df = pd.read_sql_query(sql, conn)
        conn.close()
        return df.to_string()
    except Exception as e:
        return f"查询错误: {e}"

3.3 Visualization Tools

# tools/viz_tools.py
from langchain.tools import tool
import matplotlib.pyplot as plt, seaborn as sns, pandas as pd

@tool
def generate_chart_code(data_desc: str, chart_type: str) -> str:
    """生成图表代码"""
    prompt = f"""
为以下数据生成 {chart_type} 图表的 Python 代码(使用 matplotlib/seaborn)。

数据描述:
{data_desc}

要求:
1. 代码要完整可运行
2. 添加必要的标签和标题
3. 使用中文显示
"""
    response = llm.invoke(prompt)
    return response.content

@tool
def suggest_chart_type(question: str) -> str:
    """根据分析需求推荐图表类型"""
    prompt = f"""
根据以下分析需求,推荐最合适的图表类型。

需求:
{question}

可选图表:折线图、柱状图、饼图、散点图、箱线图、热力图

输出:图表类型和推荐理由
"""
    response = llm.invoke(prompt)
    return response.content

3.4 Data Analysis Agent

# agent/data_agent.py
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from tools.schema_tools import *
from tools.sql_tools import *
from tools.viz_tools import *

class DataAnalysisAgent:
    """数据分析 Agent"""
    def __init__(self, db_path: str = "sales.db"):
        self.db_path = db_path
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.tools = [list_tables, get_table_schema, text_to_sql, execute_sql,
                      analyze_data, suggest_chart_type, generate_chart_code]
        self._setup_agent()

    def _setup_agent(self):
        prompt = ChatPromptTemplate.from_messages([
            ("system", """你是专业的数据分析师,擅长 SQL 查询和数据分析。

## 工作流程
1. 理解用户问题
2. 获取相关表结构
3. 生成并执行 SQL
4. 分析数据结果
5. 提供洞察和建议

## 可用工具
- list_tables: 查看所有表
- get_table_schema: 查看表结构
- text_to_sql: 自然语言转 SQL
- execute_sql: 执行 SQL
- analyze_data: 分析数据
- suggest_chart_type: 推荐图表
- generate_chart_code: 生成图表代码

请帮助用户分析数据。"""),
            ("human", "{input}"),
            ("placeholder", "{agent_scratchpad}")
        ])
        self.agent = create_tool_calling_agent(self.llm, self.tools, prompt)
        self.executor = AgentExecutor(agent=self.agent, tools=self.tools, verbose=True)

    def ask(self, question: str) -> str:
        """提问"""
        return self.executor.invoke({"input": question})["output"]

3.5 Example Database Initialization

# scripts/init_db.py
import sqlite3, pandas as pd, random
from datetime import datetime, timedelta

def init_sales_db():
    """初始化销售数据库"""
    conn = sqlite3.connect("sales.db")
    products = ["产品A", "产品B", "产品C", "产品D"]
    regions = ["北京", "上海", "广州", "深圳"]
    dates = [datetime(2025, 1, 1) + timedelta(days=i) for i in range(90)]
    data = []
    for i, date in enumerate(dates):
        for product in products:
            for region in regions:
                sales = random.randint(1000, 50000)
                quantity = random.randint(10, 500)
                data.append([date.strftime("%Y-%m-%d"), product, region, sales, quantity])
    df = pd.DataFrame(data, columns=["date", "product", "region", "sales", "quantity"])
    df.to_sql("sales", conn, if_exists="replace", index=False)
    # 用户表
    users = [
        (1, "张三", "北京", "2024-01-01"),
        (2, "李四", "上海", "2024-01-15"),
        (3, "王五", "广州", "2024-02-01")
    ]
    pd.DataFrame(users, columns=["id", "name", "city", "register_date"]).to_sql("users", conn, if_exists="replace", index=False)
    conn.close()
    print("数据库初始化完成")

if __name__ == "__main__":
    init_sales_db()

4. Running the Agent

# Install dependencies
pip install pandas matplotlib seaborn sqlite3

# Initialize the example database
python scripts/init_db.py

# Run the data‑analysis agent (e.g., from a Python REPL)
python -c "from agent.data_agent import DataAnalysisAgent; print(DataAnalysisAgent().ask('查询上个月的销售额'))"

我是老 J,下期见

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Sign in to view source
Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contactadmin@besthub.devand we will review it promptly.

PythonllmLangChainAI AgentSQLiteText-to-SQLData Visualization
Coder Trainee
Written by

Coder Trainee

Experienced in Java and Python, we share and learn together. For submissions or collaborations, DM us.

0 followers
Reader feedback

How this landed with the community

Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.