Merge branch 'dev' into staging
This commit is contained in:
commit
781f1af3f6
@ -28,6 +28,13 @@ jobs:
|
|||||||
region: AWS_REGION
|
region: AWS_REGION
|
||||||
repo: <<parameters.repo>>
|
repo: <<parameters.repo>>
|
||||||
tag: '<<parameters.docker-tag>>${CIRCLE_SHA1}'
|
tag: '<<parameters.docker-tag>>${CIRCLE_SHA1}'
|
||||||
|
- run:
|
||||||
|
name: Send image push Lark notification
|
||||||
|
command: |
|
||||||
|
bash scripts/ci/notify_feishu.sh \
|
||||||
|
--event image_push \
|
||||||
|
--image-repo "$AWS_ECR_ACCOUNT_URL/<<parameters.repo>>" \
|
||||||
|
--version-tag "<<parameters.docker-tag>>${CIRCLE_SHA1}"
|
||||||
deploy:
|
deploy:
|
||||||
machine:
|
machine:
|
||||||
image: ubuntu-2204:current
|
image: ubuntu-2204:current
|
||||||
@ -67,6 +74,9 @@ jobs:
|
|||||||
type: string
|
type: string
|
||||||
docker-tag:
|
docker-tag:
|
||||||
type: string
|
type: string
|
||||||
|
deploy:
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
steps:
|
steps:
|
||||||
- checkout
|
- checkout
|
||||||
- run:
|
- run:
|
||||||
@ -92,11 +102,59 @@ jobs:
|
|||||||
docker push <<parameters.repo>>:<<parameters.docker-tag>>
|
docker push <<parameters.repo>>:<<parameters.docker-tag>>
|
||||||
docker push <<parameters.repo>>:$IMAGE_TAG
|
docker push <<parameters.repo>>:$IMAGE_TAG
|
||||||
|
|
||||||
|
# 把 IMAGE_TAG 透传到后续 step(CD SSH 部署需要使用)
|
||||||
|
echo "export IMAGE_TAG=$IMAGE_TAG" >> $BASH_ENV
|
||||||
|
|
||||||
bash scripts/ci/notify_feishu.sh \
|
bash scripts/ci/notify_feishu.sh \
|
||||||
--event docker_hub \
|
--event docker_hub \
|
||||||
--image-repo <<parameters.repo>> \
|
--image-repo <<parameters.repo>> \
|
||||||
--image-tag <<parameters.docker-tag>> \
|
--image-tag <<parameters.docker-tag>> \
|
||||||
--version-tag "$IMAGE_TAG"
|
--version-tag "$IMAGE_TAG"
|
||||||
|
- when:
|
||||||
|
# 仅当 deploy=true 且当前分支为 onprem-release 时才触发 CD,避免其他分支误部署
|
||||||
|
condition:
|
||||||
|
and:
|
||||||
|
- << parameters.deploy >>
|
||||||
|
- equal: [ onprem-release, << pipeline.git.branch >> ]
|
||||||
|
steps:
|
||||||
|
# 将预先在 CircleCI 项目设置 → SSH Keys 上传的私钥加载到 ssh-agent
|
||||||
|
# ONPREM_DEPLOY_SSH_KEY_FINGERPRINT 是上传私钥后 CircleCI 返回的 MD5 指纹
|
||||||
|
- add_ssh_keys:
|
||||||
|
fingerprints:
|
||||||
|
- "$ONPREM_DEPLOY_SSH_KEY_FINGERPRINT"
|
||||||
|
- run:
|
||||||
|
name: SSH 部署到服务器(更新 catalog-agent 镜像并重启)
|
||||||
|
command: |
|
||||||
|
# 把服务器公钥写入 known_hosts,避免首次连接时的交互确认
|
||||||
|
# 服务器 SSH 端口通过 ONPREM_DEPLOY_SSH_PORT 环境变量控制(例如 17290)
|
||||||
|
mkdir -p ~/.ssh
|
||||||
|
ssh-keyscan -H -p "$ONPREM_DEPLOY_SSH_PORT" "$ONPREM_DEPLOY_SSH_HOST" >> ~/.ssh/known_hosts 2>/dev/null
|
||||||
|
|
||||||
|
# 将本次生成的 IMAGE_TAG 透传到远端脚本
|
||||||
|
# - 本地 shell 展开 $IMAGE_TAG 组装成远端命令前缀
|
||||||
|
# - heredoc 使用 'REMOTE' 单引号形式,避免本地对脚本体再次展开
|
||||||
|
# - sed 直接替换 catalog-agent 的整行 image 字段(兼容 "gptbasesparticle/..." 或
|
||||||
|
# "docker.gbase.ai/..." 等任意仓库前缀、以及 0.0.x 等任意标签格式)
|
||||||
|
# - 通过「标签首字符是数字」排除 arm64 等非 AMD64 镜像干扰(本 CD 仅在 AMD64 job 触发)
|
||||||
|
ssh -p "$ONPREM_DEPLOY_SSH_PORT" "$ONPREM_DEPLOY_SSH_USER@$ONPREM_DEPLOY_SSH_HOST" \
|
||||||
|
"IMAGE_TAG='$IMAGE_TAG' bash -s" \<<'REMOTE'
|
||||||
|
set -euo pipefail
|
||||||
|
cd gbase_onprem
|
||||||
|
echo "更新前 catalog-agent 镜像行:"
|
||||||
|
grep -E '^[[:space:]]*image:[[:space:]]*[^#[:space:]]*catalog-agent:[0-9]' docker-compose.yml || true
|
||||||
|
sed -i -E "s|^([[:space:]]*)image:[[:space:]]*[^#[:space:]]*catalog-agent:[0-9][^[:space:]]*|\1image: gptbasesparticle/catalog-agent:${IMAGE_TAG}|" docker-compose.yml
|
||||||
|
echo "更新后 catalog-agent 镜像行:"
|
||||||
|
grep -E "^[[:space:]]*image:[[:space:]]*gptbasesparticle/catalog-agent:${IMAGE_TAG}" docker-compose.yml
|
||||||
|
docker compose down catalog-agent
|
||||||
|
docker compose up catalog-agent -d
|
||||||
|
REMOTE
|
||||||
|
|
||||||
|
bash scripts/ci/notify_feishu.sh \
|
||||||
|
--event deploy \
|
||||||
|
--service-name catalog-agent \
|
||||||
|
--namespace onprem-release \
|
||||||
|
--image-repo <<parameters.repo>> \
|
||||||
|
--version-tag "$IMAGE_TAG"
|
||||||
|
|
||||||
workflows:
|
workflows:
|
||||||
backend_build_and_push:
|
backend_build_and_push:
|
||||||
@ -184,6 +242,8 @@ workflows:
|
|||||||
repo: gptbasesparticle/catalog-agent
|
repo: gptbasesparticle/catalog-agent
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
docker-tag: latest
|
docker-tag: latest
|
||||||
|
# 启用 CD:构建&推送完成后 SSH 到服务器更新 catalog-agent 镜像(仅 onprem-release 分支生效,见 job 内 when 条件)
|
||||||
|
deploy: true
|
||||||
filters:
|
filters:
|
||||||
branches:
|
branches:
|
||||||
only:
|
only:
|
||||||
|
|||||||
@ -84,6 +84,7 @@ build_deploy_payload() {
|
|||||||
local job
|
local job
|
||||||
local service_name
|
local service_name
|
||||||
local namespace
|
local namespace
|
||||||
|
local image
|
||||||
local commit
|
local commit
|
||||||
local full_sha
|
local full_sha
|
||||||
|
|
||||||
@ -93,6 +94,11 @@ build_deploy_payload() {
|
|||||||
job=$(printf '%s' "$CIRCLE_JOB" | escape_json)
|
job=$(printf '%s' "$CIRCLE_JOB" | escape_json)
|
||||||
service_name=$(printf '%s' "$SERVICE_NAME" | escape_json)
|
service_name=$(printf '%s' "$SERVICE_NAME" | escape_json)
|
||||||
namespace=$(printf '%s' "$NAMESPACE" | escape_json)
|
namespace=$(printf '%s' "$NAMESPACE" | escape_json)
|
||||||
|
if [[ -n "$IMAGE_REPO" && -n "$VERSION_TAG" ]]; then
|
||||||
|
image=$(printf '%s' "$IMAGE_REPO:$VERSION_TAG" | escape_json)
|
||||||
|
else
|
||||||
|
image="-"
|
||||||
|
fi
|
||||||
commit=$(printf '%s' "$short_sha" | escape_json)
|
commit=$(printf '%s' "$short_sha" | escape_json)
|
||||||
full_sha=$(printf '%s' "$CIRCLE_SHA1" | escape_json)
|
full_sha=$(printf '%s' "$CIRCLE_SHA1" | escape_json)
|
||||||
|
|
||||||
@ -149,6 +155,13 @@ build_deploy_payload() {
|
|||||||
"content": "**命名空间**\n${namespace}"
|
"content": "**命名空间**\n${namespace}"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"is_short": true,
|
||||||
|
"text": {
|
||||||
|
"tag": "lark_md",
|
||||||
|
"content": "**镜像**\n${image}"
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"is_short": true,
|
"is_short": true,
|
||||||
"text": {
|
"text": {
|
||||||
@ -192,7 +205,7 @@ build_deploy_payload() {
|
|||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
build_docker_hub_payload() {
|
build_image_push_payload() {
|
||||||
local color="$1"
|
local color="$1"
|
||||||
local short_sha="${CIRCLE_SHA1:0:8}"
|
local short_sha="${CIRCLE_SHA1:0:8}"
|
||||||
local message
|
local message
|
||||||
@ -222,7 +235,7 @@ build_docker_hub_payload() {
|
|||||||
"template": "${color}",
|
"template": "${color}",
|
||||||
"title": {
|
"title": {
|
||||||
"tag": "plain_text",
|
"tag": "plain_text",
|
||||||
"content": "catalog-agent Docker Hub 推送成功"
|
"content": "catalog-agent 镜像推送成功"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"elements": [
|
"elements": [
|
||||||
@ -308,14 +321,21 @@ send_payload() {
|
|||||||
case "$EVENT" in
|
case "$EVENT" in
|
||||||
deploy)
|
deploy)
|
||||||
case "$CIRCLE_BRANCH" in
|
case "$CIRCLE_BRANCH" in
|
||||||
dev|staging|prod|onprem-dev)
|
dev|staging|prod|onprem-dev|onprem-release)
|
||||||
send_payload "$(build_deploy_payload "$(header_color)")"
|
send_payload "$(build_deploy_payload "$(header_color)")"
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
;;
|
;;
|
||||||
|
image_push)
|
||||||
|
case "$CIRCLE_BRANCH" in
|
||||||
|
dev|staging|prod|onprem-dev|onprem-release)
|
||||||
|
send_payload "$(build_image_push_payload "$(header_color)")"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
;;
|
||||||
docker_hub)
|
docker_hub)
|
||||||
if [[ "$CIRCLE_BRANCH" == "onprem-release" && "$IMAGE_TAG" == "latest" ]]; then
|
if [[ "$CIRCLE_BRANCH" == "onprem-release" && "$IMAGE_TAG" == "latest" ]]; then
|
||||||
send_payload "$(build_docker_hub_payload "$(header_color)")"
|
send_payload "$(build_image_push_payload "$(header_color)")"
|
||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
|||||||
184
skills/onprem/kfs-answer/SKILL.md
Normal file
184
skills/onprem/kfs-answer/SKILL.md
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
---
|
||||||
|
name: kfs-answer
|
||||||
|
description: Primary skill for answering ALL questions about the datasets knowledge base. Search files, run queries (SQL / markdown), and return answers with citations. MUST be used first for any data-related question.
|
||||||
|
---
|
||||||
|
|
||||||
|
# kfs-answer
|
||||||
|
|
||||||
|
Answer ALL questions about the datasets knowledge base using this skill's scripts. This is the **primary and mandatory** tool for any question involving uploaded data files. Do NOT explore the filesystem, write Python code, or use other tools to access dataset content — all data access goes through the scripts below.
|
||||||
|
|
||||||
|
## Inputs
|
||||||
|
|
||||||
|
- `{user_question}` — the user's question
|
||||||
|
- `{chat_history}` — recent conversation context (may be empty)
|
||||||
|
|
||||||
|
Scripts are in `{SKILL_DIR}/scripts/`.
|
||||||
|
|
||||||
|
Datasets are auto-discovered by scripts from `./dataset/` (catalog-agent) or `./datasets/` (gbase-agent-service) subdirectories — agent does NOT need to know or pass dataset IDs.
|
||||||
|
|
||||||
|
## Scripts
|
||||||
|
|
||||||
|
- `python3 {SKILL_DIR}/scripts/search.py <query> <kw1> <kw2> ...` — scan knowledge files, return RECOMMENDED file:sheet pairs with compact summaries (source name, L0, L1, per-sheet description with fallback).
|
||||||
|
- `python3 {SKILL_DIR}/scripts/detail.py <file_id1:sheet_id1>,<file_id2:sheet_id2>` — return full schema (columns with types/stats) + sample data
|
||||||
|
- `python3 {SKILL_DIR}/scripts/query.py <file_id1:sheet_id1>,... <question> <kw1> <kw2> ...` — budget-aware auto query (db: keyword SQL, markdown: section match)
|
||||||
|
- `python3 {SKILL_DIR}/scripts/query_db.py <db_path> <SQL> [--offset N]` — execute custom SQL with auto-pagination.
|
||||||
|
|
||||||
|
**query_db.py output structure:** TSV header + data rows + status line at the end.
|
||||||
|
|
||||||
|
**Status line — three cases:**
|
||||||
|
|
||||||
|
1. `[RESULT: N/N rows returned | COMPLETE]`
|
||||||
|
All data fully returned. Proceed to answer.
|
||||||
|
|
||||||
|
2. `[RESULT: K/total returned | this batch: rows X-Y (offset A-B) | PARTIAL — call again with --offset=M]`
|
||||||
|
Output size limit reached; more data remaining.
|
||||||
|
- `K/total` — cumulative rows returned so far / total matching rows
|
||||||
|
- `this batch: rows X-Y` — 1-indexed row range this call returned
|
||||||
|
- Re-invoke with SAME `<db_path>` and `<SQL>`, adding `--offset M` as CLI arg. Repeat until COMPLETE.
|
||||||
|
- If `total` is very large (1000+), consider reducing SELECT columns or adding WHERE filters instead of paginating.
|
||||||
|
|
||||||
|
3. `[RESULT: 0 rows | EMPTY]` — query matched no rows.
|
||||||
|
`[RESULT: 0 rows | offset N exceeds total M | call again with --offset=0]` — offset out of range.
|
||||||
|
|
||||||
|
**Pagination rules:**
|
||||||
|
- `--offset` is a COMMAND-LINE argument, NOT a SQL clause. Do NOT write `OFFSET N` in SQL.
|
||||||
|
- Do NOT use SQL `LIMIT`/`OFFSET` to manually control output size — pagination handles it automatically.
|
||||||
|
- You MAY use SQL `LIMIT` when the question genuinely requires it (e.g. "top 10 by revenue").
|
||||||
|
- Keep the SQL string character-for-character IDENTICAL across pagination calls.
|
||||||
|
|
||||||
|
- `python3 {SKILL_DIR}/scripts/merge_citations.py` — merge accumulated citations from query.py/query_db.py into final `<CITATION>` tags. **MUST call once before composing answer (Step 4), regardless of which query path was used.**
|
||||||
|
|
||||||
|
Note: file:sheet pairs are comma-separated strings. Keywords are SEPARATE positional arguments — one keyword per arg, placed after the fixed args.
|
||||||
|
|
||||||
|
## Protocol
|
||||||
|
|
||||||
|
### Step 1 — search
|
||||||
|
|
||||||
|
Consider chat_history to understand full context. Extract keywords from user_question (in the question's language). Then:
|
||||||
|
|
||||||
|
```
|
||||||
|
Bash: python3 {SKILL_DIR}/scripts/search.py "<rewritten_question>" <kw1> <kw2> ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Example: `python3 {SKILL_DIR}/scripts/search.py "delivery report" delivery report overdue`
|
||||||
|
|
||||||
|
If output shows `NO_MATCH`, answer: "The dataset does not contain data relevant to this question."
|
||||||
|
|
||||||
|
### Step 2 — query
|
||||||
|
|
||||||
|
From search output, pick ONLY the file_id:sheet_id pairs relevant to the question (often just 1 file).
|
||||||
|
|
||||||
|
**Before calling query.py, classify your keywords against the search output (sheet names + L0 + L1 + per-sheet description):**
|
||||||
|
- **Table-level**: keyword appears in sheet name or L0 description → it describes the file/sheet scope, not individual rows. Do NOT pass as row-level filter.
|
||||||
|
Example: question asks about "福井県のBCP企業" → "福井県" is the sheet name (all rows belong to Fukui). Do not use it as a WHERE keyword.
|
||||||
|
- **Column-level**: keyword matches a concept mentioned in L0 as a data dimension → determines which columns to look at, not a WHERE filter.
|
||||||
|
Example: L0 says "エネルギー・たんぱく質・脂質等68項目" → "エネルギー" is a column concept, not a row filter.
|
||||||
|
- **Row-level**: keyword refers to a specific entity/item not mentioned in sheet names or L0 → use as query.py keywords for WHERE filtering.
|
||||||
|
Example: "アーモンド" is a specific food item, not in sheet name or L0 → valid row-level keyword.
|
||||||
|
|
||||||
|
Only pass **row-level keywords** to query.py:
|
||||||
|
|
||||||
|
```
|
||||||
|
Bash: python3 {SKILL_DIR}/scripts/query.py "<recommended_pairs>" "<question>" <row_kw1> <row_kw2> ...
|
||||||
|
```
|
||||||
|
|
||||||
|
This handles ~80% of questions directly. Check the results:
|
||||||
|
- **Sufficient** (no `[BUDGET]` tag, or truncation is acceptable) → go to Step 4 (answer). Done.
|
||||||
|
- **Insufficient** (`[BUDGET]` shows missing rows/columns critical to the question) → go to Step 3. **Discard query.py results completely** — query_db.py uses different SQL and ordering, so do NOT use `--offset` to "continue" from query.py. Always start query_db.py from offset=0.
|
||||||
|
- **Suspiciously few** (≤3 rows returned, but question asks for "最初/一覧/全部/比較" or total row count is much larger) → results are likely incomplete. Remove the most restrictive keyword and re-run query.py, or use empty keywords to get a broader view. If still unclear, go to Step 3.
|
||||||
|
|
||||||
|
### Sheet selection from multi-sheet files
|
||||||
|
|
||||||
|
When a RECOMMENDED file has multiple sheets (e.g., `sheet_001`/`sheet_002`, `7-2-2図①`/`7-2-2図②`, `基本票`/`詳細票`), the technical sheet names may not convey semantics. The search output now includes a per-sheet description line for each sheet. Use it to select the correct sheet:
|
||||||
|
|
||||||
|
```
|
||||||
|
- 7-2-2図①[db,30]: ①女性:14歳以上の年齢層別女性人口の推移...
|
||||||
|
- 7-2-2図②[db,30]: ②男性:14歳以上の年齢層別男性人口の推移...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Do NOT infer sheet identity from**:
|
||||||
|
- data value heuristics (e.g., "larger value = female")
|
||||||
|
- technical sheet id / name alone (e.g., `sheet_001`, `7-2-2図①`)
|
||||||
|
|
||||||
|
If the per-sheet description is missing, short, or ambiguous, call `detail.py` to get the full sheet description before issuing a WHERE/filter decision.
|
||||||
|
|
||||||
|
### Step 3 — detail + refine (only if Step 2 insufficient)
|
||||||
|
|
||||||
|
Call detail.py to understand the full schema, then write precise SQL:
|
||||||
|
|
||||||
|
```
|
||||||
|
Bash: python3 {SKILL_DIR}/scripts/detail.py "<recommended_pairs>"
|
||||||
|
```
|
||||||
|
|
||||||
|
Read the column names and types from detail output. Then write a targeted SQL query:
|
||||||
|
|
||||||
|
```
|
||||||
|
Bash: python3 {SKILL_DIR}/scripts/query_db.py "<db_path>" "SELECT col1,col2 FROM table WHERE ..."
|
||||||
|
```
|
||||||
|
|
||||||
|
**CRITICAL — Hidden `__src` column:**
|
||||||
|
Every db table has an `__src` column that is NOT shown in detail.py schema output (by design — the parser hides it from the human-readable schema). You MUST include `__src` as the FIRST column in every SELECT on a db table, regardless of what detail.py reports. Without it, per-row citation is impossible.
|
||||||
|
|
||||||
|
Example: `SELECT __src, col_a, col_b FROM sheet_001 WHERE col_a = 'x'`
|
||||||
|
|
||||||
|
The db_path is shown in query.py output. Pagination is automatic — see Scripts section above for query_db.py output protocol (COMPLETE / PARTIAL / EMPTY status line). When PARTIAL, follow the `--offset=N` instruction in the status line; keep calling until COMPLETE.
|
||||||
|
|
||||||
|
### Step 4 — output
|
||||||
|
|
||||||
|
Compose final answer, then append source attribution on the last line:
|
||||||
|
|
||||||
|
- Answer in the same language as the question
|
||||||
|
- When data was truncated, state total count and what was omitted (e.g., "Showing 20 of 32 results. Narrow your query for complete data.")
|
||||||
|
- Keep response concise — the output will be injected into another LLM's context with a ~3000 character budget
|
||||||
|
|
||||||
|
**Citations (MANDATORY):**
|
||||||
|
|
||||||
|
**ALWAYS** call `merge_citations.py` before composing your answer, regardless of whether you used query.py or query_db.py:
|
||||||
|
|
||||||
|
```
|
||||||
|
Bash: python3 {SKILL_DIR}/scripts/merge_citations.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This script reads all citation data accumulated by query.py / query_db.py, merges rows by (file, sheet), and outputs ready-to-use tags like:
|
||||||
|
|
||||||
|
```
|
||||||
|
[CITATIONS]
|
||||||
|
<CITATION file="a1b2c3d4-e5f6-7890-abcd-ef1234567890" filename="商品リスト.xlsx" sheet="1" rows="[2, 3, 4, 5]" />
|
||||||
|
```
|
||||||
|
|
||||||
|
Your job:
|
||||||
|
|
||||||
|
1. **Copy each CITATION tag EXACTLY as output by the script — character for character.** Do not modify any attributes. Rows are Excel row numbers (row 1 = header, data starts at row 2). Do NOT renumber, do NOT use range syntax. Do not invent tags.
|
||||||
|
|
||||||
|
2. **Place each tag after the paragraph / list / table that uses its data.** If only one tag, place it after the main content block. If multiple tags from different files, place each near the content that references that file.
|
||||||
|
|
||||||
|
3. **NEVER** put a tag on the same line as a list bullet or table row. **NEVER** write `__src=`.
|
||||||
|
|
||||||
|
**Calculation audit (mandatory when the answer involves arithmetic, aggregation, ratios, or percentages):**
|
||||||
|
|
||||||
|
Before writing the final answer, explicitly verify that every operand in your formula aligns with the question's scope. Output a short audit block:
|
||||||
|
|
||||||
|
```
|
||||||
|
[AUDIT]
|
||||||
|
Question scope: <entity / range / time period the question specifies>
|
||||||
|
Formula: <numerator> / <denominator> = <result> (or SUM, AVG, etc.)
|
||||||
|
Operand check:
|
||||||
|
- <operand1>: <value> — source: <row/column description> — matches question scope? YES/NO
|
||||||
|
- <operand2>: <value> — source: <row/column description> — matches question scope? YES/NO
|
||||||
|
Verdict: PASS — formula matches question semantics
|
||||||
|
OR WARNING — <operand X> scope mismatch: <explanation>. Re-querying.
|
||||||
|
```
|
||||||
|
|
||||||
|
If the verdict is WARNING, do NOT output the answer. Instead, re-query with corrected keywords to find the operand that matches the question's scope. Only output the answer after the audit passes.
|
||||||
|
|
||||||
|
Note: Pre-computed percentages or "share" values found in data remarks may use a different denominator than what the question asks. Always verify — never adopt them without confirming the denominator matches the question.
|
||||||
|
|
||||||
|
**Final output format:** Write your answer body with `<CITATION>` tags from merge_citations.py placed near the relevant content. Do not add anything else.
|
||||||
|
|
||||||
|
## Rules
|
||||||
|
|
||||||
|
1. **Query first.** Always try query.py before detail.py. Skip detail if query results are sufficient.
|
||||||
|
2. **Minimize turns.** Typical: 2 turns (search + query). Max: 3-4 turns (+ detail + query_db for complex cases).
|
||||||
|
3. **No exploratory reads.** Do not ls, Glob, or Read files. All info comes from the scripts.
|
||||||
|
4. **Verify before answering.** If query.py returns very few rows (≤3) for a listing/ranking question, do not assume the result is complete. Check if a table-level keyword was accidentally used as a row filter.
|
||||||
|
5. **Fallback flexibility.** query_db.py with custom SQL handles most needs including large result sets (via auto-pagination). Do NOT write inline Python (`sqlite3.connect`) to query knowledge.db — it bypasses query_db.py's auto-fix protections (fullwidth comma, identifier quoting, __src replacement) and causes debug loops. Accuracy over speed.
|
||||||
58
skills/onprem/kfs-answer/scripts/_session.py
Normal file
58
skills/onprem/kfs-answer/scripts/_session.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
"""Session directory management for kfs-answer scripts.
|
||||||
|
|
||||||
|
Protocol: expects the TRACE_ID env var (unique per agent session, injected by
|
||||||
|
the calling agent framework). Without it, exits — skill cannot guarantee
|
||||||
|
session isolation.
|
||||||
|
|
||||||
|
All kfs-answer temporary files (e.g. file_refs.txt) are written under
|
||||||
|
./kfs-answer-sessions/{TRACE_ID}/, isolated per session.
|
||||||
|
|
||||||
|
Cleanup: session dirs with mtime > 24h are removed on each call.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Derive project root from script location: scripts/ → kfs-answer/ → skills/ → project root
|
||||||
|
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", ".."))
|
||||||
|
SESSIONS_DIR = os.path.join(_PROJECT_ROOT, "kfs-answer-sessions")
|
||||||
|
RETENTION_SEC = 86400 # 24h
|
||||||
|
|
||||||
|
|
||||||
|
def get_session_dir():
|
||||||
|
"""Return the session dir for this agent session. Create if missing. Cleanup old."""
|
||||||
|
trace_id = os.environ.get("TRACE_ID", "").strip()
|
||||||
|
if not trace_id:
|
||||||
|
sys.stderr.write(
|
||||||
|
"[ERROR: TRACE_ID env var not set. kfs-answer skill requires the calling "
|
||||||
|
"agent framework to inject TRACE_ID (unique per agent session) into the "
|
||||||
|
"subprocess env. This is a framework integration issue.\n"
|
||||||
|
" - catalog-agent: should be injected automatically by HTTP middleware\n"
|
||||||
|
" - gbase-agent-service: main.py must set env['TRACE_ID'] before subprocess\n"
|
||||||
|
" - Manual testing: run with `TRACE_ID=$(uuidgen) python3 ...`]\n"
|
||||||
|
)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
session_path = os.path.join(SESSIONS_DIR, trace_id)
|
||||||
|
os.makedirs(session_path, exist_ok=True)
|
||||||
|
os.utime(session_path, None) # refresh mtime = active marker
|
||||||
|
_cleanup_old()
|
||||||
|
return session_path
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_old():
|
||||||
|
if not os.path.isdir(SESSIONS_DIR):
|
||||||
|
return
|
||||||
|
now = time.time()
|
||||||
|
try:
|
||||||
|
entries = os.listdir(SESSIONS_DIR)
|
||||||
|
except OSError:
|
||||||
|
return
|
||||||
|
for name in entries:
|
||||||
|
full = os.path.join(SESSIONS_DIR, name)
|
||||||
|
try:
|
||||||
|
if os.path.isdir(full) and now - os.path.getmtime(full) > RETENTION_SEC:
|
||||||
|
shutil.rmtree(full, ignore_errors=True)
|
||||||
|
except OSError:
|
||||||
|
continue
|
||||||
241
skills/onprem/kfs-answer/scripts/detail.py
Normal file
241
skills/onprem/kfs-answer/scripts/detail.py
Normal file
@ -0,0 +1,241 @@
|
|||||||
|
"""Return full schema + sample data for specified file:sheet pairs.
|
||||||
|
|
||||||
|
Usage: python3 detail.py <file_id1:sheet_id1>,<file_id2:sheet_id2>,...
|
||||||
|
|
||||||
|
Output: Per sheet — columns with type/stats/description + sample rows (from knowledge.md body).
|
||||||
|
|
||||||
|
datasets directory: ./datasets/ (gbase-agent-service) or ./dataset/ (catalog-agent), auto-detected at runtime.
|
||||||
|
dataset_ids are discovered automatically from subdirectories under datasets directory.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from _session import get_session_dir
|
||||||
|
|
||||||
|
# Derive project root from script location: scripts/ → kfs-answer/ → skills/ → project root
|
||||||
|
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||||
|
_ds = os.path.join(_PROJECT_ROOT, "datasets")
|
||||||
|
DATASETS_DIR = _ds if os.path.isdir(_ds) else os.path.join(_PROJECT_ROOT, "dataset")
|
||||||
|
|
||||||
|
|
||||||
|
def _discover_datasets():
|
||||||
|
"""Scan DATASETS_DIR for subdirectory names (each is a dataset_id)."""
|
||||||
|
if not os.path.isdir(DATASETS_DIR):
|
||||||
|
return []
|
||||||
|
return [d for d in sorted(os.listdir(DATASETS_DIR))
|
||||||
|
if os.path.isdir(os.path.join(DATASETS_DIR, d))]
|
||||||
|
|
||||||
|
|
||||||
|
def load_file_ref_map():
|
||||||
|
"""Load file_id → F{n} mapping from file_refs.txt (in session dir)."""
|
||||||
|
refs_path = os.path.join(get_session_dir(), "file_refs.txt")
|
||||||
|
mapping = {}
|
||||||
|
if not os.path.isfile(refs_path):
|
||||||
|
return mapping
|
||||||
|
ref_pat = re.compile(r"^(F\d+)=([0-9a-f-]+)\(")
|
||||||
|
with open(refs_path, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
m = ref_pat.match(line.strip())
|
||||||
|
if m:
|
||||||
|
mapping[m.group(2)] = m.group(1)
|
||||||
|
return mapping
|
||||||
|
|
||||||
|
|
||||||
|
def find_file_dir(dataset_ids, file_id):
|
||||||
|
for dataset_id in dataset_ids:
|
||||||
|
candidate = os.path.join(DATASETS_DIR, dataset_id, file_id)
|
||||||
|
if os.path.isdir(candidate):
|
||||||
|
return candidate
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def load_knowledge(file_dir):
|
||||||
|
"""Parse knowledge.md → (meta dict, body text)."""
|
||||||
|
km_path = os.path.join(file_dir, "knowledge.md")
|
||||||
|
if not os.path.isfile(km_path):
|
||||||
|
return None, None
|
||||||
|
with open(km_path, "r", encoding="utf-8") as f:
|
||||||
|
content = f.read()
|
||||||
|
if not content.startswith("---"):
|
||||||
|
return None, None
|
||||||
|
parts = content.split("---", 2)
|
||||||
|
if len(parts) < 3:
|
||||||
|
return None, None
|
||||||
|
meta = yaml.safe_load(parts[1])
|
||||||
|
body = parts[2].strip()
|
||||||
|
return meta, body
|
||||||
|
|
||||||
|
|
||||||
|
def extract_sheet_body(body, sheet_id):
|
||||||
|
"""Extract body section for a specific sheet (delimited by <!-- sheet_xxx [...] -->)."""
|
||||||
|
parts = re.split(r"<!--\s*sheet_\w+(?:\s+[^>]*)?\s*-->", body)
|
||||||
|
markers = re.findall(r"<!--\s*(sheet_\w+)(?:\s+[^>]*)?\s*-->", body)
|
||||||
|
|
||||||
|
for i, marker in enumerate(markers):
|
||||||
|
if marker == sheet_id and i < len(parts) - 1:
|
||||||
|
return parts[i + 1].strip()
|
||||||
|
# Fallback: if only one sheet and no markers, return entire body
|
||||||
|
if len(markers) == 0 and len(parts) == 1:
|
||||||
|
return body.strip()
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_sheet_src(body, sheet_id):
|
||||||
|
"""Extract __src value from <!-- sheet_xxx __src="F0S1" --> marker. Returns empty string if not found."""
|
||||||
|
m = re.search(rf'<!--\s*{re.escape(sheet_id)}\s+__src="([^"]*)"', body)
|
||||||
|
return m.group(1) if m else ""
|
||||||
|
|
||||||
|
|
||||||
|
def format_columns(columns):
|
||||||
|
"""Format columns as compact schema display."""
|
||||||
|
lines = []
|
||||||
|
for col in columns:
|
||||||
|
name = col.get("name", "?")
|
||||||
|
ctype = col.get("type", "text")
|
||||||
|
desc = col.get("description", "")
|
||||||
|
stats = []
|
||||||
|
if "distinct" in col:
|
||||||
|
stats.append(f"distinct={col['distinct']}")
|
||||||
|
if "null_rate" in col:
|
||||||
|
stats.append(f"null={col['null_rate']}")
|
||||||
|
if "avg_length" in col:
|
||||||
|
stats.append(f"avg_len={col['avg_length']}")
|
||||||
|
if "range" in col:
|
||||||
|
stats.append(f"range={col['range']}")
|
||||||
|
if "mean" in col:
|
||||||
|
stats.append(f"mean={col['mean']}")
|
||||||
|
if "sample" in col:
|
||||||
|
sample = col["sample"]
|
||||||
|
if isinstance(sample, list):
|
||||||
|
sample = ",".join(str(s) for s in sample[:5])
|
||||||
|
stats.append(f"sample=[{sample}]")
|
||||||
|
if "values" in col:
|
||||||
|
vals = col["values"]
|
||||||
|
if isinstance(vals, list):
|
||||||
|
vals = ",".join(str(v) for v in vals[:8])
|
||||||
|
stats.append(f"values=[{vals}]")
|
||||||
|
if "topics" in col:
|
||||||
|
topics = col["topics"]
|
||||||
|
if isinstance(topics, list):
|
||||||
|
topics = ",".join(str(t) for t in topics[:5])
|
||||||
|
stats.append(f"topics=[{topics}]")
|
||||||
|
|
||||||
|
stats_str = f" ({', '.join(stats)})" if stats else ""
|
||||||
|
desc_str = f" — {desc}" if desc else ""
|
||||||
|
lines.append(f" {name} [{ctype}]{stats_str}{desc_str}")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Auto-discover datasets from ./dataset/ or ./datasets/ subdirectories
|
||||||
|
dataset_ids = _discover_datasets()
|
||||||
|
raw_entries = [e.strip() for e in sys.argv[1].split(",") if e.strip()]
|
||||||
|
|
||||||
|
# Load F0→F{n} mapping
|
||||||
|
f_ref_map = load_file_ref_map()
|
||||||
|
|
||||||
|
# Parse file_id:sheet_id pairs
|
||||||
|
entries = []
|
||||||
|
for entry in raw_entries:
|
||||||
|
if ":" in entry:
|
||||||
|
fid, sid = entry.split(":", 1)
|
||||||
|
entries.append((fid.strip(), sid.strip()))
|
||||||
|
else:
|
||||||
|
entries.append((entry.strip(), None))
|
||||||
|
|
||||||
|
# Group by file_id
|
||||||
|
file_sheets = {}
|
||||||
|
for fid, sid in entries:
|
||||||
|
file_sheets.setdefault(fid, []).append(sid)
|
||||||
|
|
||||||
|
for fid, sheet_ids in file_sheets.items():
|
||||||
|
file_dir = find_file_dir(dataset_ids, fid)
|
||||||
|
if not file_dir:
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f"file_id: {fid}")
|
||||||
|
print(f" ERROR: not found")
|
||||||
|
continue
|
||||||
|
|
||||||
|
meta, body = load_knowledge(file_dir)
|
||||||
|
if not meta:
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f"file_id: {fid}")
|
||||||
|
print(f" ERROR: knowledge.md not found or invalid")
|
||||||
|
continue
|
||||||
|
|
||||||
|
source_name = meta.get("source_name", "unknown")
|
||||||
|
# Check for knowledge.db
|
||||||
|
db_path = os.path.join(file_dir, "knowledge.db")
|
||||||
|
has_db = os.path.isfile(db_path)
|
||||||
|
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f"file_id: {fid}")
|
||||||
|
print(f"source: {source_name}")
|
||||||
|
if has_db:
|
||||||
|
print(f"db_path: {db_path}")
|
||||||
|
|
||||||
|
sheets_meta = {s["id"]: s for s in meta.get("sheets", [])}
|
||||||
|
|
||||||
|
for sid in sheet_ids:
|
||||||
|
if sid is None:
|
||||||
|
# Show all sheets
|
||||||
|
target_sheets = list(sheets_meta.values())
|
||||||
|
elif sid in sheets_meta:
|
||||||
|
target_sheets = [sheets_meta[sid]]
|
||||||
|
else:
|
||||||
|
print(f"\n sheet {sid}: NOT FOUND in metadata")
|
||||||
|
continue
|
||||||
|
|
||||||
|
for sheet in target_sheets:
|
||||||
|
sheet_id = sheet["id"]
|
||||||
|
sname = sheet.get("name", "?")
|
||||||
|
stype = sheet.get("type", "?")
|
||||||
|
print(f"\n --- {sheet_id}: {sname} [{stype}]")
|
||||||
|
_sheet_desc = str(sheet.get("description") or "").strip()
|
||||||
|
_block_titles = sheet.get("block_titles") or []
|
||||||
|
if _sheet_desc:
|
||||||
|
print(f" description: {_sheet_desc[:200]}")
|
||||||
|
elif _block_titles:
|
||||||
|
print(f" description (fallback from block_titles): {str(_block_titles[0])[:200]}")
|
||||||
|
elif sname and not str(sname).startswith("sheet_"):
|
||||||
|
print(f" description (fallback from sheet name): {sname}")
|
||||||
|
|
||||||
|
if stype == "db":
|
||||||
|
row_count = sheet.get("row_count", "?")
|
||||||
|
db_table = sheet.get("db_table", sheet_id)
|
||||||
|
print(f" table: {db_table}, rows: {row_count}")
|
||||||
|
columns = sheet.get("columns", [])
|
||||||
|
if columns:
|
||||||
|
print(f" columns ({len(columns)}):")
|
||||||
|
print(format_columns(columns))
|
||||||
|
else:
|
||||||
|
block_count = sheet.get("block_count", "?")
|
||||||
|
print(f" blocks: {block_count}")
|
||||||
|
|
||||||
|
# Show body section (notes + sample for db, content for markdown)
|
||||||
|
if body:
|
||||||
|
src_tag = extract_sheet_src(body, sheet_id) if body else ""
|
||||||
|
f_code = f_ref_map.get(fid, "")
|
||||||
|
if src_tag and f_code and "F0" in src_tag:
|
||||||
|
src_tag = src_tag.replace("F0S", f"{f_code}S")
|
||||||
|
section = extract_sheet_body(body, sheet_id)
|
||||||
|
if section:
|
||||||
|
# Truncate to ~2000 chars
|
||||||
|
if len(section) > 2000:
|
||||||
|
section = section[:2000] + "\n ... [truncated]"
|
||||||
|
if src_tag:
|
||||||
|
print(f' __src="{src_tag}"')
|
||||||
|
print(f" content:")
|
||||||
|
for line in section.split("\n"):
|
||||||
|
print(f" {line}")
|
||||||
|
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"Done. Showed {len(file_sheets)} files.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
71
skills/onprem/kfs-answer/scripts/format_answer.py
Normal file
71
skills/onprem/kfs-answer/scripts/format_answer.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Read file_refs.txt (header) + answer_body.txt (agent answer) → write combined to final_answer.txt + print.
|
||||||
|
|
||||||
|
Usage: python3 format_answer.py
|
||||||
|
|
||||||
|
Reads from session directory:
|
||||||
|
./file_refs.txt — written by search.py (F1=uuid(name) format)
|
||||||
|
./answer_body.txt — written by agent (answer text with <CITATION> tags)
|
||||||
|
|
||||||
|
Writes: ./final_answer.txt (header + body combined, ready for cat)
|
||||||
|
Also prints to stdout for immediate visibility.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from _session import get_session_dir
|
||||||
|
|
||||||
|
BODY_FILE = "./answer_body.txt"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
REFS_FILE = os.path.join(get_session_dir(), "file_refs.txt")
|
||||||
|
# Read header from file_refs.txt
|
||||||
|
header_lines = []
|
||||||
|
if os.path.exists(REFS_FILE):
|
||||||
|
with open(REFS_FILE, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
# Convert F1=uuid(name) → F1 = uuid (name)
|
||||||
|
if "=" in line and line[0] == "F":
|
||||||
|
parts = line.split("=", 1)
|
||||||
|
ref = parts[0].strip()
|
||||||
|
rest = parts[1].strip()
|
||||||
|
header_lines.append(f"{ref} = {rest}")
|
||||||
|
else:
|
||||||
|
header_lines.append(line)
|
||||||
|
|
||||||
|
else:
|
||||||
|
print("WARNING: file_refs.txt not found, outputting answer without header", file=sys.stderr)
|
||||||
|
|
||||||
|
# Read answer body
|
||||||
|
if not os.path.exists(BODY_FILE):
|
||||||
|
print(f"ERROR: {BODY_FILE} not found. Write your answer to {BODY_FILE} first.", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
with open(BODY_FILE, "r", encoding="utf-8") as f:
|
||||||
|
body = f.read().strip()
|
||||||
|
|
||||||
|
# Build combined output
|
||||||
|
parts = []
|
||||||
|
if header_lines:
|
||||||
|
parts.append("\n".join(header_lines))
|
||||||
|
parts.append("")
|
||||||
|
parts.append(body)
|
||||||
|
combined = "\n".join(parts)
|
||||||
|
|
||||||
|
# Write to file (agent will cat this)
|
||||||
|
with open("./final_answer.txt", "w", encoding="utf-8") as f:
|
||||||
|
f.write(combined)
|
||||||
|
|
||||||
|
# Also print for immediate visibility
|
||||||
|
print(f"Written to ./final_answer.txt ({len(combined)} chars)")
|
||||||
|
print()
|
||||||
|
print(combined)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
56
skills/onprem/kfs-answer/scripts/merge_citations.py
Normal file
56
skills/onprem/kfs-answer/scripts/merge_citations.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
"""Merge accumulated citation data into final CITATION tags.
|
||||||
|
|
||||||
|
Usage: python3 merge_citations.py
|
||||||
|
|
||||||
|
Reads {session_dir}/citations.jsonl (appended by query.py / query_db.py),
|
||||||
|
merges rows by (file, sheet), outputs one <CITATION .../> tag per combination.
|
||||||
|
|
||||||
|
Agent calls this ONCE before composing the final answer (Step 4).
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from _session import get_session_dir
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
citations_path = os.path.join(get_session_dir(), "citations.jsonl")
|
||||||
|
if not os.path.isfile(citations_path):
|
||||||
|
print("[NO CITATIONS]")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Read all citation entries
|
||||||
|
groups = {} # (file, sheet) -> {"filename": str, "rows": set}
|
||||||
|
with open(citations_path, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
entry = json.loads(line)
|
||||||
|
key = (entry["file"], entry["sheet"])
|
||||||
|
if key not in groups:
|
||||||
|
groups[key] = {"filename": entry.get("filename", ""), "rows": set()}
|
||||||
|
for r in entry.get("rows", []):
|
||||||
|
groups[key]["rows"].add(r)
|
||||||
|
|
||||||
|
if not groups:
|
||||||
|
print("[NO CITATIONS]")
|
||||||
|
return
|
||||||
|
|
||||||
|
print("[CITATIONS]")
|
||||||
|
for (file_id, sheet_num) in sorted(groups.keys()):
|
||||||
|
info = groups[(file_id, sheet_num)]
|
||||||
|
fn_attr = f' filename="{info["filename"]}"' if info["filename"] else ""
|
||||||
|
rows = sorted(info["rows"])
|
||||||
|
if rows:
|
||||||
|
rows_str = "[" + ", ".join(str(r) for r in rows) + "]"
|
||||||
|
print(f'<CITATION file="{file_id}"{fn_attr} sheet="{sheet_num}" rows="{rows_str}" />')
|
||||||
|
else:
|
||||||
|
# Sheet-level citation (markdown, no rows)
|
||||||
|
print(f'<CITATION file="{file_id}"{fn_attr} sheet="{sheet_num}" />')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
517
skills/onprem/kfs-answer/scripts/query.py
Normal file
517
skills/onprem/kfs-answer/scripts/query.py
Normal file
@ -0,0 +1,517 @@
|
|||||||
|
"""Budget-aware auto query for knowledge files.
|
||||||
|
|
||||||
|
Usage: python3 query.py <file_id1:sheet_id1>,... <question> <kw1> <kw2> ...
|
||||||
|
|
||||||
|
Keywords are separate positional arguments (not comma-separated).
|
||||||
|
|
||||||
|
For db-type sheets: keyword SQL with budget control (COUNT → sample → select columns → LIMIT).
|
||||||
|
For markdown-type sheets: keyword section matching within budget.
|
||||||
|
|
||||||
|
Output: TSV (or markdown section) followed by a `[CITATIONS]` block with pre-built
|
||||||
|
<CITATION file="F1" filename="..." sheet="N" rows="[...]" /> tags. The `__src`
|
||||||
|
column is consumed internally and stripped from visible output — agent should
|
||||||
|
preserve and place CITATION tags near the data they cite.
|
||||||
|
|
||||||
|
datasets directory: ./datasets/ (gbase-agent-service) or ./dataset/ (catalog-agent), auto-detected at runtime.
|
||||||
|
dataset_ids are discovered automatically from subdirectories under datasets directory.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from _session import get_session_dir
|
||||||
|
|
||||||
|
# Derive project root from script location: scripts/ → kfs-answer/ → skills/ → project root
|
||||||
|
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||||
|
_ds = os.path.join(_PROJECT_ROOT, "datasets")
|
||||||
|
DATASETS_DIR = _ds if os.path.isdir(_ds) else os.path.join(_PROJECT_ROOT, "dataset")
|
||||||
|
|
||||||
|
|
||||||
|
def _discover_datasets():
|
||||||
|
"""Scan DATASETS_DIR for subdirectory names (each is a dataset_id)."""
|
||||||
|
if not os.path.isdir(DATASETS_DIR):
|
||||||
|
return []
|
||||||
|
return [d for d in sorted(os.listdir(DATASETS_DIR))
|
||||||
|
if os.path.isdir(os.path.join(DATASETS_DIR, d))]
|
||||||
|
|
||||||
|
|
||||||
|
def load_file_ref_map():
|
||||||
|
"""Load file_id → (F{n}, filename) mapping from file_refs.txt (in session dir)."""
|
||||||
|
refs_path = os.path.join(get_session_dir(), "file_refs.txt")
|
||||||
|
mapping = {} # file_id → (f_code, filename)
|
||||||
|
if not os.path.isfile(refs_path):
|
||||||
|
return mapping
|
||||||
|
ref_pat = re.compile(r"^(F\d+)=([0-9a-f-]+)\((.+?)\)\s*$")
|
||||||
|
with open(refs_path, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
m = ref_pat.match(line.strip())
|
||||||
|
if m:
|
||||||
|
mapping[m.group(2)] = (m.group(1), m.group(3))
|
||||||
|
return mapping
|
||||||
|
|
||||||
|
|
||||||
|
# Row-level: cell value stored as `__src="F0S1R5"` (xls-agent-parse wraps it),
|
||||||
|
# so non-anchored + re.search() extracts the triple. Sheet-level comes from
|
||||||
|
# extract_sheet_src which returns bare `F0S1`, so that one stays anchored.
|
||||||
|
SRC_ROW_PAT = re.compile(r"F(\d+)S(\d+)R(\d+)")
|
||||||
|
SRC_SHEET_PAT = re.compile(r"^F(\d+)S(\d+)$")
|
||||||
|
|
||||||
|
|
||||||
|
def _format_citation(file_id, sheet_num, filename, row_nums=None):
|
||||||
|
"""Build one CITATION tag. file_id=UUID. row_nums=None → sheet-level (no rows attr)."""
|
||||||
|
fn_attr = f' filename="{filename}"' if filename else ""
|
||||||
|
if row_nums is None:
|
||||||
|
return f'<CITATION file="{file_id}"{fn_attr} sheet="{sheet_num}" />'
|
||||||
|
rows_str = "[" + ", ".join(str(r) for r in row_nums) + "]"
|
||||||
|
return f'<CITATION file="{file_code}"{fn_attr} sheet="{sheet_num}" rows="{rows_str}" />'
|
||||||
|
|
||||||
|
|
||||||
|
def replace_f0(text, f_code):
|
||||||
|
"""Replace F0 with assigned f_code (e.g., F1) in __src values."""
|
||||||
|
if not f_code or f_code == "F0":
|
||||||
|
return text
|
||||||
|
return text.replace('F0S', f'{f_code}S')
|
||||||
|
|
||||||
|
|
||||||
|
def extract_sheet_src(body, sheet_id):
|
||||||
|
"""Extract __src value from <!-- sheet_xxx __src="F0S1" --> marker. Returns empty string if not found."""
|
||||||
|
m = re.search(rf'<!--\s*{re.escape(sheet_id)}\s+__src="([^"]*)"', body)
|
||||||
|
return m.group(1) if m else ""
|
||||||
|
|
||||||
|
|
||||||
|
def calc_budget(entry_count):
|
||||||
|
"""Character budget per entry, scaled by total count."""
|
||||||
|
if entry_count <= 1:
|
||||||
|
return 3000
|
||||||
|
elif entry_count == 2:
|
||||||
|
return 1800
|
||||||
|
else:
|
||||||
|
return 1200
|
||||||
|
|
||||||
|
|
||||||
|
def find_file_dir(dataset_ids, file_id):
|
||||||
|
for dataset_id in dataset_ids:
|
||||||
|
candidate = os.path.join(DATASETS_DIR, dataset_id, file_id)
|
||||||
|
if os.path.isdir(candidate):
|
||||||
|
return candidate
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def load_knowledge_meta(file_dir):
|
||||||
|
km_path = os.path.join(file_dir, "knowledge.md")
|
||||||
|
if not os.path.isfile(km_path):
|
||||||
|
return None, None
|
||||||
|
with open(km_path, "r", encoding="utf-8") as f:
|
||||||
|
content = f.read()
|
||||||
|
if not content.startswith("---"):
|
||||||
|
return None, None
|
||||||
|
parts = content.split("---", 2)
|
||||||
|
if len(parts) < 3:
|
||||||
|
return None, None
|
||||||
|
meta = yaml.safe_load(parts[1])
|
||||||
|
body = parts[2].strip()
|
||||||
|
return meta, body
|
||||||
|
|
||||||
|
|
||||||
|
def query_db_sheet(db_path, table_name, columns, keywords, budget):
|
||||||
|
"""Budget-aware SQLite query: COUNT → sample → select columns → LIMIT."""
|
||||||
|
if not os.path.isfile(db_path):
|
||||||
|
return {"error": f"DB not found: {db_path}"}
|
||||||
|
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
|
||||||
|
# Get actual table columns from DB
|
||||||
|
try:
|
||||||
|
cursor = conn.execute(f'PRAGMA table_info("{table_name}")')
|
||||||
|
db_cols = [r["name"] for r in cursor.fetchall()]
|
||||||
|
except Exception:
|
||||||
|
conn.close()
|
||||||
|
return {"error": f"Table not found: {table_name}"}
|
||||||
|
|
||||||
|
if not db_cols:
|
||||||
|
conn.close()
|
||||||
|
return {"error": f"Table has no columns: {table_name}"}
|
||||||
|
|
||||||
|
# Build WHERE clause from keywords — pick the MOST SELECTIVE keyword
|
||||||
|
where_clause = None
|
||||||
|
where_params = None
|
||||||
|
used_kw = None
|
||||||
|
|
||||||
|
valid_kws = [kw for kw in keywords if kw]
|
||||||
|
|
||||||
|
if valid_kws:
|
||||||
|
# Try each keyword individually, pick the one with smallest COUNT > 0
|
||||||
|
best_kw = None
|
||||||
|
best_count = float("inf")
|
||||||
|
best_wc = None
|
||||||
|
best_wp = None
|
||||||
|
for kw in valid_kws:
|
||||||
|
conditions = [f'"{c}" LIKE ?' for c in db_cols]
|
||||||
|
wc = " OR ".join(conditions)
|
||||||
|
wp = [f"%{kw}%"] * len(db_cols)
|
||||||
|
try:
|
||||||
|
cursor = conn.execute(f'SELECT COUNT(*) FROM "{table_name}" WHERE {wc}', wp)
|
||||||
|
cnt = cursor.fetchone()[0]
|
||||||
|
if 0 < cnt < best_count:
|
||||||
|
best_kw, best_count, best_wc, best_wp = kw, cnt, wc, wp
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if best_kw:
|
||||||
|
where_clause, where_params, used_kw = best_wc, best_wp, best_kw
|
||||||
|
|
||||||
|
# Fallback: all keywords OR (if no single keyword matched)
|
||||||
|
if where_clause is None and valid_kws:
|
||||||
|
conditions = []
|
||||||
|
params = []
|
||||||
|
for kw in valid_kws:
|
||||||
|
for c in db_cols:
|
||||||
|
conditions.append(f'"{c}" LIKE ?')
|
||||||
|
params.append(f"%{kw}%")
|
||||||
|
wc = " OR ".join(conditions)
|
||||||
|
try:
|
||||||
|
cursor = conn.execute(f'SELECT COUNT(*) FROM "{table_name}" WHERE {wc}', params)
|
||||||
|
cnt = cursor.fetchone()[0]
|
||||||
|
if cnt > 0:
|
||||||
|
where_clause, where_params = wc, params
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback: no keywords → get all
|
||||||
|
if where_clause is None and not valid_kws:
|
||||||
|
where_clause = "1=1"
|
||||||
|
where_params = []
|
||||||
|
|
||||||
|
if where_clause is None:
|
||||||
|
conn.close()
|
||||||
|
return {"rows": [], "total": 0, "note": "No keyword match"}
|
||||||
|
|
||||||
|
# COUNT total
|
||||||
|
try:
|
||||||
|
cursor = conn.execute(f'SELECT COUNT(*) FROM "{table_name}" WHERE {where_clause}', where_params)
|
||||||
|
total_rows = cursor.fetchone()[0]
|
||||||
|
except Exception:
|
||||||
|
conn.close()
|
||||||
|
return {"error": "COUNT failed"}
|
||||||
|
|
||||||
|
if total_rows == 0:
|
||||||
|
conn.close()
|
||||||
|
return {"rows": [], "total": 0}
|
||||||
|
|
||||||
|
# Sample 5 rows to estimate column widths
|
||||||
|
try:
|
||||||
|
cursor = conn.execute(f'SELECT * FROM "{table_name}" WHERE {where_clause} LIMIT 5', where_params)
|
||||||
|
sample_rows = [dict(r) for r in cursor.fetchall()]
|
||||||
|
except Exception:
|
||||||
|
conn.close()
|
||||||
|
return {"error": "Sample query failed"}
|
||||||
|
|
||||||
|
col_avg_chars = {}
|
||||||
|
for col in db_cols:
|
||||||
|
total_chars = sum(len(str(row.get(col, "") or "")) for row in sample_rows)
|
||||||
|
col_avg_chars[col] = total_chars / max(len(sample_rows), 1)
|
||||||
|
|
||||||
|
# Budget check: decide columns and LIMIT
|
||||||
|
header_overhead = len("\t".join(db_cols)) + 50
|
||||||
|
avg_row_chars = sum(col_avg_chars.values()) + len(db_cols)
|
||||||
|
estimated_total = header_overhead + total_rows * avg_row_chars
|
||||||
|
|
||||||
|
if estimated_total <= budget:
|
||||||
|
select_cols = db_cols
|
||||||
|
limit = total_rows
|
||||||
|
else:
|
||||||
|
# __src column always retained (source marker for citation)
|
||||||
|
# Prioritize keyword-hit columns, then fill remaining budget
|
||||||
|
must_cols = [c for c in db_cols if c == "__src"]
|
||||||
|
keyword_cols = []
|
||||||
|
other_cols = []
|
||||||
|
for col in db_cols:
|
||||||
|
if col == "__src":
|
||||||
|
continue
|
||||||
|
is_kw_col = any(
|
||||||
|
any(kw.lower() in str(row.get(col, "") or "").lower() for row in sample_rows)
|
||||||
|
for kw in keywords if kw
|
||||||
|
)
|
||||||
|
if is_kw_col:
|
||||||
|
keyword_cols.append(col)
|
||||||
|
else:
|
||||||
|
other_cols.append(col)
|
||||||
|
|
||||||
|
select_cols = must_cols + (keyword_cols[:] if keyword_cols else [])
|
||||||
|
|
||||||
|
def row_width(cols):
|
||||||
|
return sum(col_avg_chars.get(c, 10) for c in cols) + len(cols)
|
||||||
|
|
||||||
|
for col in other_cols:
|
||||||
|
test_cols = select_cols + [col]
|
||||||
|
if header_overhead + 5 * row_width(test_cols) <= budget:
|
||||||
|
select_cols.append(col)
|
||||||
|
|
||||||
|
if not select_cols:
|
||||||
|
select_cols = db_cols
|
||||||
|
|
||||||
|
rw = row_width(select_cols)
|
||||||
|
available = budget - header_overhead
|
||||||
|
limit = max(1, int(available / max(rw, 1)))
|
||||||
|
limit = min(limit, total_rows)
|
||||||
|
|
||||||
|
# Execute
|
||||||
|
cols_str = ", ".join(f'"{c}"' for c in select_cols)
|
||||||
|
sql = f'SELECT {cols_str} FROM "{table_name}" WHERE {where_clause} LIMIT {limit}'
|
||||||
|
try:
|
||||||
|
cursor = conn.execute(sql, where_params)
|
||||||
|
rows = [dict(r) for r in cursor.fetchall()]
|
||||||
|
except Exception:
|
||||||
|
conn.close()
|
||||||
|
return {"error": "Query execution failed"}
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"table": table_name,
|
||||||
|
"columns": select_cols,
|
||||||
|
"all_columns": db_cols,
|
||||||
|
"rows": rows,
|
||||||
|
"count": len(rows),
|
||||||
|
"total": total_rows,
|
||||||
|
"fields_reduced": len(select_cols) < len(db_cols),
|
||||||
|
"rows_limited": limit < total_rows,
|
||||||
|
"keyword": used_kw,
|
||||||
|
"db_path": db_path,
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def query_markdown_sheet(body, sheet_id, keywords, budget):
|
||||||
|
"""Keyword-based section matching for markdown sheets."""
|
||||||
|
parts = re.split(r"<!--\s*sheet_\w+(?:\s+[^>]*)?\s*-->", body)
|
||||||
|
markers = re.findall(r"<!--\s*(sheet_\w+)(?:\s+[^>]*)?\s*-->", body)
|
||||||
|
|
||||||
|
section = ""
|
||||||
|
for i, marker in enumerate(markers):
|
||||||
|
if marker == sheet_id and i < len(parts) - 1:
|
||||||
|
section = parts[i + 1].strip()
|
||||||
|
break
|
||||||
|
if not section and len(markers) == 0 and len(parts) == 1:
|
||||||
|
section = body.strip()
|
||||||
|
|
||||||
|
if not section:
|
||||||
|
return {"content": "", "note": "No content found"}
|
||||||
|
|
||||||
|
if len(section) <= budget:
|
||||||
|
return {"content": section, "full": True}
|
||||||
|
|
||||||
|
# Keyword-based line matching with context
|
||||||
|
lines = section.split("\n")
|
||||||
|
matched_indices = set()
|
||||||
|
context = 3
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
for kw in keywords:
|
||||||
|
if kw and kw.lower() in line.lower():
|
||||||
|
for j in range(max(0, i - context), min(len(lines), i + context + 1)):
|
||||||
|
matched_indices.add(j)
|
||||||
|
|
||||||
|
if not matched_indices:
|
||||||
|
truncated = section[:budget]
|
||||||
|
last_nl = truncated.rfind("\n")
|
||||||
|
if last_nl > budget * 0.7:
|
||||||
|
truncated = truncated[:last_nl]
|
||||||
|
return {"content": truncated, "note": f"[No keyword match. First {len(truncated)} chars of {len(section)}]"}
|
||||||
|
|
||||||
|
result_lines = []
|
||||||
|
chars = 0
|
||||||
|
prev = -2
|
||||||
|
for idx in sorted(matched_indices):
|
||||||
|
line = lines[idx]
|
||||||
|
line_chars = len(line) + 1
|
||||||
|
if chars + line_chars > budget:
|
||||||
|
break
|
||||||
|
if idx > prev + 1:
|
||||||
|
result_lines.append("---")
|
||||||
|
chars += 4
|
||||||
|
result_lines.append(line)
|
||||||
|
chars += line_chars
|
||||||
|
prev = idx
|
||||||
|
|
||||||
|
return {"content": "\n".join(result_lines), "matched_lines": len(result_lines)}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Auto-discover datasets from ./dataset/ or ./datasets/ subdirectories
|
||||||
|
dataset_ids = _discover_datasets()
|
||||||
|
raw_entries = [e.strip() for e in sys.argv[1].split(",") if e.strip()]
|
||||||
|
question = sys.argv[2] if len(sys.argv) > 2 else ""
|
||||||
|
keywords = sys.argv[3:] # remaining positional args are keywords
|
||||||
|
|
||||||
|
entries = []
|
||||||
|
for entry in raw_entries:
|
||||||
|
if ":" in entry:
|
||||||
|
fid, sid = entry.split(":", 1)
|
||||||
|
entries.append((fid.strip(), sid.strip()))
|
||||||
|
else:
|
||||||
|
entries.append((entry.strip(), None))
|
||||||
|
|
||||||
|
per_entry_budget = calc_budget(len(entries))
|
||||||
|
print(f"[Budget: {per_entry_budget} chars/entry, {len(entries)} entries]")
|
||||||
|
|
||||||
|
# Load F0→F{n} mapping from search.py
|
||||||
|
f_ref_map = load_file_ref_map() # file_id → f_code
|
||||||
|
|
||||||
|
for fid, target_sheet_id in entries:
|
||||||
|
file_dir = find_file_dir(dataset_ids, fid)
|
||||||
|
if not file_dir:
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"file_id: {fid}")
|
||||||
|
print(f" ERROR: not found")
|
||||||
|
continue
|
||||||
|
|
||||||
|
meta, body = load_knowledge_meta(file_dir)
|
||||||
|
if not meta:
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"file_id: {fid}")
|
||||||
|
print(f" ERROR: knowledge.md invalid")
|
||||||
|
continue
|
||||||
|
|
||||||
|
source_name = meta.get("source_name", "unknown")
|
||||||
|
sheets_meta = {s["id"]: s for s in meta.get("sheets", [])}
|
||||||
|
|
||||||
|
if target_sheet_id and target_sheet_id in sheets_meta:
|
||||||
|
target_sheets = [(target_sheet_id, sheets_meta[target_sheet_id])]
|
||||||
|
elif target_sheet_id:
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"file_id: {fid}, sheet: {target_sheet_id}")
|
||||||
|
print(f" ERROR: sheet not found")
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
target_sheets = list(sheets_meta.items())
|
||||||
|
|
||||||
|
for sid, sheet in target_sheets:
|
||||||
|
stype = sheet.get("type", "unknown")
|
||||||
|
sname = sheet.get("name", "?")
|
||||||
|
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"file_id: {fid} / {sid}: {sname} [{stype}]")
|
||||||
|
print(f"source: {source_name}")
|
||||||
|
|
||||||
|
if stype == "db":
|
||||||
|
db_path = os.path.join(file_dir, "knowledge.db")
|
||||||
|
db_table = sheet.get("db_table", sid)
|
||||||
|
print(f"db: {db_path}, table: {db_table}")
|
||||||
|
|
||||||
|
result = query_db_sheet(db_path, db_table, sheet.get("columns", []),
|
||||||
|
keywords, per_entry_budget)
|
||||||
|
if "error" in result:
|
||||||
|
print(f" ERROR: {result['error']}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not result.get("rows"):
|
||||||
|
print(f" No matching rows (total: {result.get('total', 0)})")
|
||||||
|
continue
|
||||||
|
|
||||||
|
budget_info = []
|
||||||
|
if result.get("fields_reduced"):
|
||||||
|
budget_info.append(f"fields: {len(result['columns'])}/{len(result['all_columns'])}")
|
||||||
|
if result.get("rows_limited"):
|
||||||
|
budget_info.append(f"rows: {result['count']}/{result['total']}")
|
||||||
|
budget_str = f" [BUDGET: {', '.join(budget_info)}]" if budget_info else ""
|
||||||
|
|
||||||
|
# Hide __src from both COLUMNS report and TSV display — it's consumed
|
||||||
|
# into CITATION tags below.
|
||||||
|
display_cols = [c for c in result["columns"] if c != "__src"]
|
||||||
|
biz_all_cols = [c for c in result["all_columns"] if c != "__src"]
|
||||||
|
omitted = [c for c in biz_all_cols if c not in display_cols]
|
||||||
|
|
||||||
|
print(f" TABLE: {result['table']} ({result['count']}/{result['total']} rows){budget_str}")
|
||||||
|
print(f" COLUMNS: {', '.join(display_cols)}")
|
||||||
|
if omitted:
|
||||||
|
print(f" OMITTED: {len(omitted)} columns")
|
||||||
|
if result.get("keyword"):
|
||||||
|
print(f" KEYWORD: {result['keyword']}")
|
||||||
|
# Warn if keyword filtering returned suspiciously few rows
|
||||||
|
if result.get("keyword") and result["count"] <= 3 and result["total"] >= 10:
|
||||||
|
print(f" ⚠ NOTE: keyword \"{result['keyword']}\" matched only {result['count']}/{result['total']} rows. Results may be incomplete — consider removing this keyword.")
|
||||||
|
print("-" * 40)
|
||||||
|
# TSV output — __src stripped, collected into src_groups for CITATIONS.
|
||||||
|
f_entry = f_ref_map.get(fid, ("", ""))
|
||||||
|
f_code, filename = f_entry
|
||||||
|
cols = result["columns"]
|
||||||
|
has_src = "__src" in cols
|
||||||
|
print("\t".join(display_cols))
|
||||||
|
|
||||||
|
src_groups = {} # (file_code, sheet_num) -> set of row_nums
|
||||||
|
for row in result["rows"]:
|
||||||
|
if has_src:
|
||||||
|
raw_src = row.get("__src")
|
||||||
|
if raw_src:
|
||||||
|
src_val = str(raw_src)
|
||||||
|
if f_code:
|
||||||
|
src_val = replace_f0(src_val, f_code)
|
||||||
|
m = SRC_ROW_PAT.search(src_val)
|
||||||
|
if m:
|
||||||
|
file_code = f"F{m.group(1)}"
|
||||||
|
sheet_num = int(m.group(2))
|
||||||
|
row_num = int(m.group(3))
|
||||||
|
src_groups.setdefault((file_code, sheet_num), set()).add(row_num)
|
||||||
|
vals = []
|
||||||
|
for c in display_cols:
|
||||||
|
v = row.get(c)
|
||||||
|
s = "" if v is None else str(v)
|
||||||
|
if len(s) > 200:
|
||||||
|
s = s[:200] + "..."
|
||||||
|
vals.append(s)
|
||||||
|
print("\t".join(vals))
|
||||||
|
|
||||||
|
if src_groups:
|
||||||
|
import json
|
||||||
|
citations_path = os.path.join(get_session_dir(), "citations.jsonl")
|
||||||
|
with open(citations_path, "a", encoding="utf-8") as cf:
|
||||||
|
for (file_code, sheet_num) in sorted(src_groups.keys()):
|
||||||
|
row_nums = sorted(src_groups[(file_code, sheet_num)])
|
||||||
|
cf.write(json.dumps({
|
||||||
|
"file": fid, "filename": filename,
|
||||||
|
"sheet": sheet_num, "rows": row_nums,
|
||||||
|
"source": "query",
|
||||||
|
}, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
|
elif stype == "markdown":
|
||||||
|
if not body:
|
||||||
|
print(f" ERROR: no body content")
|
||||||
|
continue
|
||||||
|
src_tag = extract_sheet_src(body, sid) if body else ""
|
||||||
|
f_entry = f_ref_map.get(fid, ("", ""))
|
||||||
|
f_code, filename = f_entry
|
||||||
|
if src_tag and f_code:
|
||||||
|
src_tag = replace_f0(src_tag, f_code)
|
||||||
|
result = query_markdown_sheet(body, sid, keywords, per_entry_budget)
|
||||||
|
if result.get("note"):
|
||||||
|
print(f" {result['note']}")
|
||||||
|
print("-" * 40)
|
||||||
|
print(result.get("content", ""))
|
||||||
|
# Write sheet-level citation to session file (no stdout).
|
||||||
|
if src_tag:
|
||||||
|
m = SRC_SHEET_PAT.match(src_tag)
|
||||||
|
if m:
|
||||||
|
import json
|
||||||
|
sheet_num = int(m.group(2))
|
||||||
|
citations_path = os.path.join(get_session_dir(), "citations.jsonl")
|
||||||
|
with open(citations_path, "a", encoding="utf-8") as cf:
|
||||||
|
cf.write(json.dumps({
|
||||||
|
"file": fid, "filename": filename,
|
||||||
|
"sheet": sheet_num, "rows": [],
|
||||||
|
"source": "query",
|
||||||
|
}, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
|
else:
|
||||||
|
print(f" ERROR: unknown type '{stype}'")
|
||||||
|
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"Done. Queried {len(entries)} entries.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
442
skills/onprem/kfs-answer/scripts/query_db.py
Normal file
442
skills/onprem/kfs-answer/scripts/query_db.py
Normal file
@ -0,0 +1,442 @@
|
|||||||
|
"""Execute custom SQL on a knowledge.db with auto-pagination.
|
||||||
|
|
||||||
|
Usage: python3 query_db.py <db_path> <sql> [--offset N]
|
||||||
|
|
||||||
|
Auto-fixes common LLM SQL issues before execution:
|
||||||
|
- Fullwidth punctuation → ASCII
|
||||||
|
- Unquoted identifiers → double-quoted (matched against actual DB columns/tables)
|
||||||
|
|
||||||
|
Output: TSV format with header + status line at the end.
|
||||||
|
|
||||||
|
Status line (one of three):
|
||||||
|
[RESULT: N/N rows returned | COMPLETE]
|
||||||
|
→ All data returned in this call. Proceed to answer.
|
||||||
|
|
||||||
|
[RESULT: K/total returned | this batch: rows X-Y (offset A-B) | PARTIAL — call again with --offset=M]
|
||||||
|
→ Output size limit reached. Re-invoke with SAME <db_path> and <SQL>, adding `--offset M`.
|
||||||
|
Keep calling until you see COMPLETE.
|
||||||
|
|
||||||
|
[RESULT: 0 rows | EMPTY]
|
||||||
|
→ Query matched no rows.
|
||||||
|
|
||||||
|
Key rules:
|
||||||
|
- --offset is a COMMAND-LINE argument, NOT a SQL clause.
|
||||||
|
- Keep the SQL string identical across pagination calls. Do NOT add LIMIT/OFFSET to the SQL
|
||||||
|
for output control (pagination is automatic). You MAY use SQL LIMIT when the question
|
||||||
|
genuinely requires it (e.g. "top 10 by X").
|
||||||
|
|
||||||
|
Example:
|
||||||
|
python3 query_db.py ./dataset/abc/xyz/knowledge.db "SELECT * FROM sheet_001 WHERE col LIKE '%x%'"
|
||||||
|
python3 query_db.py ./dataset/abc/xyz/knowledge.db "SELECT * FROM sheet_001 WHERE col LIKE '%x%'" --offset=11
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from _session import get_session_dir
|
||||||
|
|
||||||
|
MAX_OUTPUT_CHARS = 3000
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_fullwidth(sql):
|
||||||
|
"""Replace fullwidth punctuation with ASCII equivalents."""
|
||||||
|
sql = sql.replace("\uff0c", ",") # , → ,
|
||||||
|
sql = sql.replace("\u3000", " ") # fullwidth space → space
|
||||||
|
sql = sql.replace("\uff08", "(") # ( → (
|
||||||
|
sql = sql.replace("\uff09", ")") # ) → )
|
||||||
|
sql = sql.replace("\u2018", "'") # ' → '
|
||||||
|
sql = sql.replace("\u2019", "'") # ' → '
|
||||||
|
sql = sql.replace("\u201c", '"') # " → "
|
||||||
|
sql = sql.replace("\u201d", '"') # " → "
|
||||||
|
return sql
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_width(s):
|
||||||
|
"""Normalize fullwidth digits/letters to halfwidth for comparison."""
|
||||||
|
result = []
|
||||||
|
for ch in s:
|
||||||
|
cp = ord(ch)
|
||||||
|
# Fullwidth digits 0-9 (0xFF10-0xFF19) → 0-9
|
||||||
|
if 0xFF10 <= cp <= 0xFF19:
|
||||||
|
result.append(chr(cp - 0xFF10 + ord('0')))
|
||||||
|
# Fullwidth uppercase A-Z (0xFF21-0xFF3A) → A-Z
|
||||||
|
elif 0xFF21 <= cp <= 0xFF3A:
|
||||||
|
result.append(chr(cp - 0xFF21 + ord('A')))
|
||||||
|
# Fullwidth lowercase a-z (0xFF41-0xFF5A) → a-z
|
||||||
|
elif 0xFF41 <= cp <= 0xFF5A:
|
||||||
|
result.append(chr(cp - 0xFF41 + ord('a')))
|
||||||
|
else:
|
||||||
|
result.append(ch)
|
||||||
|
return ''.join(result)
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_quoted_identifiers(sql, conn):
|
||||||
|
"""Fix double-quoted identifiers that don't match DB columns due to fullwidth/halfwidth mismatch.
|
||||||
|
|
||||||
|
SQLite treats unmatched "identifier" as a string literal, causing silent wrong results.
|
||||||
|
This function replaces quoted strings with the exact DB column name when they match after normalization.
|
||||||
|
"""
|
||||||
|
identifiers = set()
|
||||||
|
try:
|
||||||
|
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
|
||||||
|
tables = [r[0] for r in cursor.fetchall()]
|
||||||
|
for table in tables:
|
||||||
|
identifiers.add(table)
|
||||||
|
cursor = conn.execute(f'PRAGMA table_info("{table}")')
|
||||||
|
for col_info in cursor.fetchall():
|
||||||
|
identifiers.add(col_info[1])
|
||||||
|
except Exception:
|
||||||
|
return sql
|
||||||
|
|
||||||
|
# Build normalized lookup: normalized_form → exact_name
|
||||||
|
norm_lookup = {}
|
||||||
|
for ident in identifiers:
|
||||||
|
norm = _normalize_width(ident).replace('\u3000', ' ').strip()
|
||||||
|
norm_lookup[norm] = ident
|
||||||
|
|
||||||
|
# Find all "quoted strings" in SQL and fix mismatches
|
||||||
|
def replace_quoted(match):
|
||||||
|
quoted = match.group(1)
|
||||||
|
# Already exact match — no fix needed
|
||||||
|
if quoted in identifiers:
|
||||||
|
return f'"{quoted}"'
|
||||||
|
# Try normalized match
|
||||||
|
norm = _normalize_width(quoted).replace('\u3000', ' ').strip()
|
||||||
|
if norm in norm_lookup:
|
||||||
|
return f'"{norm_lookup[norm]}"'
|
||||||
|
return match.group(0)
|
||||||
|
|
||||||
|
sql = re.sub(r'"([^"]+)"', replace_quoted, sql)
|
||||||
|
return sql
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_identifiers(sql, conn):
|
||||||
|
"""Auto-quote unquoted table/column identifiers using actual DB schema.
|
||||||
|
|
||||||
|
Handles two LLM issues:
|
||||||
|
1. Unquoted identifiers: `SELECT 営業担当 FROM sheet_001` → `SELECT "営業担当" FROM "sheet_001"`
|
||||||
|
2. Inserted spaces at CJK/ASCII boundary: `貴社ご注文 NO` → `"貴社ご注文NO"`
|
||||||
|
(qwen tokenizer splits CJK and ASCII, generating spaces between them)
|
||||||
|
"""
|
||||||
|
# Collect all table names and column names from the database
|
||||||
|
identifiers = set()
|
||||||
|
try:
|
||||||
|
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
|
||||||
|
tables = [r[0] for r in cursor.fetchall()]
|
||||||
|
for table in tables:
|
||||||
|
identifiers.add(table)
|
||||||
|
cursor = conn.execute(f'PRAGMA table_info("{table}")')
|
||||||
|
for col_info in cursor.fetchall():
|
||||||
|
identifiers.add(col_info[1]) # column name
|
||||||
|
except Exception:
|
||||||
|
return sql
|
||||||
|
|
||||||
|
SQL_KEYWORDS = {
|
||||||
|
'SELECT', 'FROM', 'WHERE', 'AND', 'OR', 'NOT', 'IN', 'LIKE', 'ORDER',
|
||||||
|
'BY', 'GROUP', 'HAVING', 'LIMIT', 'OFFSET', 'AS', 'ON', 'JOIN', 'LEFT',
|
||||||
|
'RIGHT', 'INNER', 'OUTER', 'UNION', 'ALL', 'DISTINCT', 'COUNT', 'SUM',
|
||||||
|
'AVG', 'MIN', 'MAX', 'DESC', 'ASC', 'NULL', 'IS', 'BETWEEN', 'EXISTS',
|
||||||
|
'CASE', 'WHEN', 'THEN', 'ELSE', 'END', 'CREATE', 'TABLE', 'INSERT',
|
||||||
|
'INTO', 'VALUES', 'UPDATE', 'SET', 'DELETE', 'DROP', 'ALTER', 'INDEX',
|
||||||
|
'TEMP', 'TEMPORARY', 'IF', 'REPLACE', 'SUBSTR', 'COLLATE', 'NOCASE',
|
||||||
|
'CAST', 'INTEGER', 'TEXT', 'REAL',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Sort by length descending so longer identifiers are matched first
|
||||||
|
sorted_ids = sorted(identifiers, key=len, reverse=True)
|
||||||
|
|
||||||
|
for ident in sorted_ids:
|
||||||
|
if not ident or len(ident) < 2:
|
||||||
|
continue
|
||||||
|
if ident.upper() in SQL_KEYWORDS:
|
||||||
|
continue
|
||||||
|
if f'"{ident}"' in sql:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Build a flexible pattern that allows:
|
||||||
|
# 1. Optional spaces at CJK/ASCII boundaries (qwen tokenizer splits)
|
||||||
|
# 2. Fullwidth/halfwidth bracket interchange ()↔()
|
||||||
|
# e.g., "年次(西暦)" matches "年次(西暦)" or "年次(西暦)" or "年次 (西暦)"
|
||||||
|
BRACKET_MAP = {'(': '[((]', ')': '[))]', '(': '[((]', ')': '[))]'}
|
||||||
|
flex_chars = []
|
||||||
|
for i, ch in enumerate(ident):
|
||||||
|
if ch in BRACKET_MAP:
|
||||||
|
flex_chars.append(BRACKET_MAP[ch])
|
||||||
|
else:
|
||||||
|
flex_chars.append(re.escape(ch))
|
||||||
|
if i < len(ident) - 1:
|
||||||
|
curr_cjk = ord(ch) > 0x2E80
|
||||||
|
next_cjk = ord(ident[i + 1]) > 0x2E80
|
||||||
|
if curr_cjk != next_cjk:
|
||||||
|
flex_chars.append(r'\s*') # optional whitespace at boundary
|
||||||
|
flex_pattern = ''.join(flex_chars)
|
||||||
|
|
||||||
|
# Match only unquoted occurrences
|
||||||
|
pattern = r'(?<!")(' + flex_pattern + r')(?!")'
|
||||||
|
sql = re.sub(pattern, f'"{ident}"', sql)
|
||||||
|
|
||||||
|
return sql
|
||||||
|
|
||||||
|
|
||||||
|
def _load_file_ref(db_path):
|
||||||
|
"""Return (f_code, file_id, filename) for this db file from file_refs.txt. ("","","") if not found."""
|
||||||
|
parts = db_path.replace("\\", "/").split("/")
|
||||||
|
file_id = ""
|
||||||
|
for i, p in enumerate(parts):
|
||||||
|
if p == "knowledge.db" and i >= 1:
|
||||||
|
file_id = parts[i - 1]
|
||||||
|
break
|
||||||
|
if not file_id:
|
||||||
|
return ("", "", "")
|
||||||
|
refs_path = os.path.join(get_session_dir(), "file_refs.txt")
|
||||||
|
if not os.path.isfile(refs_path):
|
||||||
|
return ("", file_id, "")
|
||||||
|
ref_pat = re.compile(r"^(F\d+)=([0-9a-f-]+)\((.+?)\)\s*$")
|
||||||
|
with open(refs_path, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
m = ref_pat.match(line.strip())
|
||||||
|
if m and m.group(2) == file_id:
|
||||||
|
return (m.group(1), file_id, m.group(3))
|
||||||
|
return ("", file_id, "")
|
||||||
|
|
||||||
|
|
||||||
|
def _purge_query_citations(db_path, sql):
|
||||||
|
"""Remove stale query.py citation entries for this (file, sheet) from citations.jsonl.
|
||||||
|
|
||||||
|
Called unconditionally when query_db.py runs — ensures query.py's
|
||||||
|
partial/stale data never survives into merge_citations.py output.
|
||||||
|
Purges by (file_id, sheet) so other sheets' query.py data is preserved.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
_, file_id, _ = _load_file_ref(db_path)
|
||||||
|
if not file_id:
|
||||||
|
return
|
||||||
|
citations_path = os.path.join(get_session_dir(), "citations.jsonl")
|
||||||
|
if not os.path.isfile(citations_path):
|
||||||
|
return
|
||||||
|
|
||||||
|
# Determine target sheet from SQL table name → DB __src
|
||||||
|
sheet_num = None
|
||||||
|
m = re.search(r'FROM\s+"?(\w+)"?', sql, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
table_name = m.group(1)
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
cursor = conn.execute(f'SELECT __src FROM "{table_name}" LIMIT 1')
|
||||||
|
row = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
if row and row[0]:
|
||||||
|
sm = re.search(r'S(\d+)', str(row[0]))
|
||||||
|
if sm:
|
||||||
|
sheet_num = int(sm.group(1))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
keep = []
|
||||||
|
with open(citations_path, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
entry = json.loads(line)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
if (entry.get("file") == file_id
|
||||||
|
and entry.get("source") == "query"
|
||||||
|
and (sheet_num is None or entry.get("sheet") == sheet_num)):
|
||||||
|
continue # discard this (file, sheet) query.py entry
|
||||||
|
keep.append(line + "\n")
|
||||||
|
with open(citations_path, "w", encoding="utf-8") as f:
|
||||||
|
f.writelines(keep)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_offset(args):
|
||||||
|
"""Parse --offset N or --offset=N from remaining args. Returns (offset, ok, error_msg)."""
|
||||||
|
offset = 0
|
||||||
|
i = 0
|
||||||
|
while i < len(args):
|
||||||
|
a = args[i]
|
||||||
|
if a == "--offset" and i + 1 < len(args):
|
||||||
|
try:
|
||||||
|
offset = int(args[i + 1])
|
||||||
|
except ValueError:
|
||||||
|
return 0, False, f"invalid --offset value: {args[i + 1]}"
|
||||||
|
i += 2
|
||||||
|
elif a.startswith("--offset="):
|
||||||
|
try:
|
||||||
|
offset = int(a.split("=", 1)[1])
|
||||||
|
except ValueError:
|
||||||
|
return 0, False, f"invalid --offset value: {a}"
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
i += 1
|
||||||
|
if offset < 0:
|
||||||
|
offset = 0
|
||||||
|
return offset, True, ""
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
if len(sys.argv) < 3:
|
||||||
|
print("Usage: python3 query_db.py <db_path> <sql> [--offset N]")
|
||||||
|
return
|
||||||
|
|
||||||
|
db_path = sys.argv[1]
|
||||||
|
sql = sys.argv[2]
|
||||||
|
|
||||||
|
offset, ok, err = _parse_offset(sys.argv[3:])
|
||||||
|
if not ok:
|
||||||
|
print(f"[RESULT: 0 rows | {err}]")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Immediately purge stale query.py citations for this (file, sheet).
|
||||||
|
# Must run unconditionally — even if this query returns 0 rows or errors out.
|
||||||
|
_purge_query_citations(db_path, sql)
|
||||||
|
|
||||||
|
# Step 1: Fix fullwidth punctuation
|
||||||
|
sql = _fix_fullwidth(sql)
|
||||||
|
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
|
||||||
|
# Step 2: Auto-quote identifiers using actual DB schema
|
||||||
|
sql = _fix_identifiers(sql, conn)
|
||||||
|
|
||||||
|
# Step 3: Fix quoted identifiers with fullwidth/halfwidth mismatch
|
||||||
|
sql = _fix_quoted_identifiers(sql, conn)
|
||||||
|
|
||||||
|
try:
|
||||||
|
cursor = conn.execute(sql)
|
||||||
|
rows = cursor.fetchall()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"SQL ERROR: {e}")
|
||||||
|
print(f"SQL (after auto-fix): {sql}")
|
||||||
|
conn.close()
|
||||||
|
return
|
||||||
|
|
||||||
|
total = len(rows)
|
||||||
|
|
||||||
|
# Empty result
|
||||||
|
if total == 0:
|
||||||
|
print("[RESULT: 0 rows | EMPTY]")
|
||||||
|
print(f"SQL: {sql}")
|
||||||
|
conn.close()
|
||||||
|
return
|
||||||
|
|
||||||
|
# offset out of range
|
||||||
|
if offset >= total:
|
||||||
|
print(f"[RESULT: 0 rows | offset {offset} exceeds total {total} | call again with --offset=0]")
|
||||||
|
conn.close()
|
||||||
|
return
|
||||||
|
|
||||||
|
columns = list(rows[0].keys())
|
||||||
|
has_src = "__src" in columns
|
||||||
|
output_columns = [c for c in columns if c != "__src"]
|
||||||
|
n_cols = max(len(output_columns), 1)
|
||||||
|
|
||||||
|
# Load (F{n}, file_id UUID, filename) for this db — used to pre-build CITATION tags.
|
||||||
|
f_code, file_id, filename = _load_file_ref(db_path)
|
||||||
|
|
||||||
|
# Dynamic cell-level truncation:
|
||||||
|
# budget per cell = MAX_OUTPUT / n_cols / 2, clamped to [100, 500].
|
||||||
|
# Prevents single wide row from consuming entire budget when many columns.
|
||||||
|
cell_max = max(100, min(500, MAX_OUTPUT_CHARS // n_cols // 2))
|
||||||
|
|
||||||
|
# Build output (TSV rows, __src excluded)
|
||||||
|
lines = []
|
||||||
|
lines.append("\t".join(output_columns))
|
||||||
|
lines.append("-" * 40)
|
||||||
|
|
||||||
|
total_chars = sum(len(l) + 1 for l in lines)
|
||||||
|
shown_rows = 0 # rows output in THIS batch
|
||||||
|
|
||||||
|
# Collect __src groups for CITATION tags — only for rows actually emitted.
|
||||||
|
# Cell value is stored as `__src="F0S1R5"` (xls-agent-parse wraps it), so use
|
||||||
|
# search() with a non-anchored pattern to extract the F/S/R triple.
|
||||||
|
SRC_PAT = re.compile(r"F(\d+)S(\d+)R(\d+)")
|
||||||
|
src_groups = {} # (file_code, sheet_num) -> set of row_nums
|
||||||
|
|
||||||
|
for idx in range(offset, total):
|
||||||
|
row = rows[idx]
|
||||||
|
vals = []
|
||||||
|
for c in output_columns:
|
||||||
|
v = row[c]
|
||||||
|
s = "" if v is None else str(v)
|
||||||
|
# Cell-level truncation (dynamic based on column count)
|
||||||
|
if len(s) > cell_max:
|
||||||
|
s = s[:cell_max] + "..."
|
||||||
|
vals.append(s)
|
||||||
|
line = "\t".join(vals)
|
||||||
|
line_chars = len(line) + 1
|
||||||
|
|
||||||
|
# Force at least 1 row to guarantee progress (even if that single row exceeds budget)
|
||||||
|
if shown_rows > 0 and total_chars + line_chars > MAX_OUTPUT_CHARS:
|
||||||
|
break
|
||||||
|
|
||||||
|
lines.append(line)
|
||||||
|
total_chars += line_chars
|
||||||
|
shown_rows += 1
|
||||||
|
|
||||||
|
# Record __src for this emitted row
|
||||||
|
if has_src:
|
||||||
|
raw_src = row["__src"]
|
||||||
|
if raw_src is not None:
|
||||||
|
src_val = str(raw_src)
|
||||||
|
if f_code and "F0S" in src_val:
|
||||||
|
src_val = src_val.replace("F0S", f"{f_code}S", 1)
|
||||||
|
m = SRC_PAT.search(src_val)
|
||||||
|
if m:
|
||||||
|
file_code = f"F{m.group(1)}"
|
||||||
|
sheet_num = int(m.group(2))
|
||||||
|
row_num = int(m.group(3))
|
||||||
|
src_groups.setdefault((file_code, sheet_num), set()).add(row_num)
|
||||||
|
|
||||||
|
# Write citation data to session citations.jsonl (merge_citations.py reads it later).
|
||||||
|
# No [CITATIONS] block in stdout — LLM only sees final merged tags from merge_citations.py.
|
||||||
|
if src_groups:
|
||||||
|
import json
|
||||||
|
citations_path = os.path.join(get_session_dir(), "citations.jsonl")
|
||||||
|
fid_val = file_id if file_id else ""
|
||||||
|
with open(citations_path, "a", encoding="utf-8") as cf:
|
||||||
|
for (file_code, sheet_num) in sorted(src_groups.keys()):
|
||||||
|
row_nums = sorted(src_groups[(file_code, sheet_num)])
|
||||||
|
cf.write(json.dumps({
|
||||||
|
"file": fid_val if fid_val else file_code,
|
||||||
|
"filename": filename,
|
||||||
|
"sheet": sheet_num, "rows": row_nums,
|
||||||
|
"source": "query_db",
|
||||||
|
}, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
|
# Build status line
|
||||||
|
next_offset = offset + shown_rows
|
||||||
|
batch_start_row = offset + 1 # human-readable, 1-indexed
|
||||||
|
batch_end_row = offset + shown_rows
|
||||||
|
batch_offset_start = offset
|
||||||
|
batch_offset_end = offset + shown_rows - 1
|
||||||
|
|
||||||
|
lines.append("")
|
||||||
|
if next_offset >= total:
|
||||||
|
if offset == 0:
|
||||||
|
# Single-call, all data
|
||||||
|
lines.append(f"[RESULT: {total}/{total} rows returned | COMPLETE]")
|
||||||
|
else:
|
||||||
|
# Final batch in a paginated sequence
|
||||||
|
lines.append(
|
||||||
|
f"[RESULT: {total}/{total} returned | this batch: rows {batch_start_row}-{batch_end_row} "
|
||||||
|
f"(offset {batch_offset_start}-{batch_offset_end}) | COMPLETE]"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# More data remaining
|
||||||
|
lines.append(
|
||||||
|
f"[RESULT: {next_offset}/{total} returned | this batch: rows {batch_start_row}-{batch_end_row} "
|
||||||
|
f"(offset {batch_offset_start}-{batch_offset_end}) | PARTIAL — call again with --offset={next_offset}]"
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n".join(lines))
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
240
skills/onprem/kfs-answer/scripts/search.py
Normal file
240
skills/onprem/kfs-answer/scripts/search.py
Normal file
@ -0,0 +1,240 @@
|
|||||||
|
"""search_v2 — adds:
|
||||||
|
R_a: expose L1 (truncated)
|
||||||
|
R_b: expose per-sheet description with fallback to name/block_titles; cap 5 sheets per file
|
||||||
|
R_c: secondary sort by rare-keyword hits (rare = appears in <30 files)
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
from _session import get_session_dir
|
||||||
|
|
||||||
|
# Derive project root from script location: scripts/ → kfs-answer/ → skills/ → project root
|
||||||
|
_PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||||
|
_ds = os.path.join(_PROJECT_ROOT, "datasets")
|
||||||
|
DATASETS_DIR = _ds if os.path.isdir(_ds) else os.path.join(_PROJECT_ROOT, "dataset")
|
||||||
|
RARE_THRESHOLD = 30 # keyword appearing in < N files counts as "rare"
|
||||||
|
L1_MAX_CHARS = 300
|
||||||
|
DESC_MAX_CHARS = 80
|
||||||
|
SHEET_CAP = 5 # per file
|
||||||
|
|
||||||
|
|
||||||
|
def _discover_datasets():
|
||||||
|
"""Scan DATASETS_DIR for subdirectory names (each is a dataset_id)."""
|
||||||
|
if not os.path.isdir(DATASETS_DIR):
|
||||||
|
return []
|
||||||
|
return [d for d in sorted(os.listdir(DATASETS_DIR))
|
||||||
|
if os.path.isdir(os.path.join(DATASETS_DIR, d))]
|
||||||
|
|
||||||
|
|
||||||
|
def load_knowledge_files(dataset_ids):
|
||||||
|
entries = []
|
||||||
|
for dataset_id in dataset_ids:
|
||||||
|
dataset_dir = os.path.join(DATASETS_DIR, dataset_id)
|
||||||
|
if not os.path.isdir(dataset_dir):
|
||||||
|
continue
|
||||||
|
for file_id in sorted(os.listdir(dataset_dir)):
|
||||||
|
file_dir = os.path.join(dataset_dir, file_id)
|
||||||
|
if not os.path.isdir(file_dir):
|
||||||
|
continue
|
||||||
|
km_path = os.path.join(file_dir, "knowledge.md")
|
||||||
|
if not os.path.isfile(km_path):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
with open(km_path, "r", encoding="utf-8") as f:
|
||||||
|
content = f.read()
|
||||||
|
if not content.startswith("---"):
|
||||||
|
continue
|
||||||
|
parts = content.split("---", 2)
|
||||||
|
if len(parts) < 3:
|
||||||
|
continue
|
||||||
|
meta = yaml.safe_load(parts[1])
|
||||||
|
if not meta:
|
||||||
|
continue
|
||||||
|
entries.append({"dataset_id": dataset_id, "file_id": file_id, "meta": meta})
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
return entries
|
||||||
|
|
||||||
|
|
||||||
|
def build_searchable_text(entry):
|
||||||
|
meta = entry["meta"]
|
||||||
|
parts = [
|
||||||
|
str(meta.get("L0", "")),
|
||||||
|
str(meta.get("L1", "")),
|
||||||
|
str(meta.get("source_name", "")),
|
||||||
|
]
|
||||||
|
for sheet in meta.get("sheets", []):
|
||||||
|
parts.append(str(sheet.get("name", "")))
|
||||||
|
parts.append(str(sheet.get("description", "")))
|
||||||
|
for col in sheet.get("columns", []):
|
||||||
|
parts.append(str(col.get("name", "")))
|
||||||
|
parts.append(str(col.get("description", "")))
|
||||||
|
for title in (sheet.get("block_titles") or []):
|
||||||
|
parts.append(str(title))
|
||||||
|
return " ".join(parts).lower()
|
||||||
|
|
||||||
|
|
||||||
|
def score_entry(text, keywords):
|
||||||
|
"""Primary score = hit_ratio (same as v1)."""
|
||||||
|
hits = sum(1 for k in keywords if k.lower() in text)
|
||||||
|
return round(hits / max(len(keywords), 1), 3)
|
||||||
|
|
||||||
|
|
||||||
|
def rare_hits(text, keywords, rare_set):
|
||||||
|
"""Secondary score = # rare keywords that hit."""
|
||||||
|
return sum(1 for k in keywords if k.lower() in text and k.lower() in rare_set)
|
||||||
|
|
||||||
|
|
||||||
|
MIN_KW_LEN_FOR_DATA_SCAN = 2 # skip single-char keywords to avoid noise
|
||||||
|
|
||||||
|
|
||||||
|
def data_scan_hits(entry, keywords):
|
||||||
|
"""Scan knowledge.db row data for keyword matches. Returns # keywords found in any row.
|
||||||
|
|
||||||
|
Lightweight: one LIKE query per (table, keyword). Skips keywords < MIN_KW_LEN chars.
|
||||||
|
"""
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
db_path = os.path.join(DATASETS_DIR, entry["dataset_id"], entry["file_id"], "knowledge.db")
|
||||||
|
if not os.path.isfile(db_path):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
scan_kws = [k for k in keywords if len(k) >= MIN_KW_LEN_FOR_DATA_SCAN]
|
||||||
|
if not scan_kws:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
tables = [r[0] for r in conn.execute(
|
||||||
|
"SELECT name FROM sqlite_master WHERE type='table'").fetchall()]
|
||||||
|
except Exception:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
hits = set()
|
||||||
|
for kw in scan_kws:
|
||||||
|
for table in tables:
|
||||||
|
try:
|
||||||
|
cols = [r[1] for r in conn.execute(f'PRAGMA table_info("{table}")').fetchall()
|
||||||
|
if r[1] != "__src"]
|
||||||
|
if not cols:
|
||||||
|
continue
|
||||||
|
where = " OR ".join(f'"{c}" LIKE ?' for c in cols)
|
||||||
|
params = [f"%{kw}%"] * len(cols)
|
||||||
|
cursor = conn.execute(f'SELECT COUNT(*) FROM "{table}" WHERE {where}', params)
|
||||||
|
if cursor.fetchone()[0] > 0:
|
||||||
|
hits.add(kw)
|
||||||
|
break # this kw found, no need to check other tables
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
conn.close()
|
||||||
|
return len(hits)
|
||||||
|
|
||||||
|
|
||||||
|
def sheet_label(sheet):
|
||||||
|
"""R_b: description → name → block_titles[0] → '(untitled)'"""
|
||||||
|
desc = str(sheet.get("description") or "").strip()
|
||||||
|
if desc:
|
||||||
|
return desc[:DESC_MAX_CHARS]
|
||||||
|
name = str(sheet.get("name") or "").strip()
|
||||||
|
if name and not name.startswith("sheet_"):
|
||||||
|
return name
|
||||||
|
titles = sheet.get("block_titles") or []
|
||||||
|
if titles:
|
||||||
|
return str(titles[0])[:DESC_MAX_CHARS]
|
||||||
|
return name or "(untitled)"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Auto-discover datasets from ./dataset/ or ./datasets/ subdirectories
|
||||||
|
dataset_ids = _discover_datasets()
|
||||||
|
query = sys.argv[1] if len(sys.argv) > 1 else ""
|
||||||
|
keywords = sys.argv[2:]
|
||||||
|
top_n = 20
|
||||||
|
|
||||||
|
entries = load_knowledge_files(dataset_ids)
|
||||||
|
if not entries:
|
||||||
|
print("NO_MATCH")
|
||||||
|
return
|
||||||
|
|
||||||
|
# R_c: pre-compute which keywords are "rare" across all entries
|
||||||
|
texts = [(e, build_searchable_text(e)) for e in entries]
|
||||||
|
doc_freq = Counter()
|
||||||
|
for _, text in texts:
|
||||||
|
for k in set(kw.lower() for kw in keywords):
|
||||||
|
if k in text:
|
||||||
|
doc_freq[k] += 1
|
||||||
|
rare_set = {k for k, n in doc_freq.items() if n < RARE_THRESHOLD}
|
||||||
|
|
||||||
|
# Score: metadata text match + row data scan
|
||||||
|
scored = []
|
||||||
|
for entry, text in texts:
|
||||||
|
s = score_entry(text, keywords)
|
||||||
|
r = rare_hits(text, keywords, rare_set)
|
||||||
|
d = data_scan_hits(entry, keywords)
|
||||||
|
scored.append({**entry, "score": s, "rare_hits": r, "data_hits": d})
|
||||||
|
|
||||||
|
# Primary by score, secondary by data_hits, tertiary by rare_hits
|
||||||
|
scored.sort(key=lambda x: (-x["score"], -x["data_hits"], -x["rare_hits"]))
|
||||||
|
|
||||||
|
matched = [s for s in scored if s["score"] > 0 or s["data_hits"] > 0]
|
||||||
|
results = (matched or scored)[:top_n]
|
||||||
|
|
||||||
|
pairs = []
|
||||||
|
seen = set()
|
||||||
|
for r in results:
|
||||||
|
# Cap sheets per file in RECOMMENDED too (keeps output budget sane)
|
||||||
|
file_sheets = r["meta"].get("sheets", [])[:SHEET_CAP]
|
||||||
|
for sheet in file_sheets:
|
||||||
|
pair = f"{r['file_id']}:{sheet['id']}"
|
||||||
|
if pair not in seen:
|
||||||
|
seen.add(pair)
|
||||||
|
pairs.append(pair)
|
||||||
|
|
||||||
|
note = " (keyword matched)" if matched else " (no keyword match, showing all)"
|
||||||
|
print(f"Total files: {len(entries)}, Returned: {len(results)}{note}")
|
||||||
|
print()
|
||||||
|
print(f"RECOMMENDED: {','.join(pairs)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# file_refs.txt — written to per-session dir (isolated by TRACE_ID)
|
||||||
|
refs_path = os.path.join(get_session_dir(), "file_refs.txt")
|
||||||
|
with open(refs_path, "w", encoding="utf-8") as f:
|
||||||
|
for idx, r in enumerate(results):
|
||||||
|
f_code = f"F{idx + 1}"
|
||||||
|
source_name = r["meta"].get("source_name", "unknown")
|
||||||
|
print(f'FILE_REF: {f_code}={r["file_id"]}({source_name})')
|
||||||
|
f.write(f'{f_code}={r["file_id"]}({source_name})\n')
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Per file summary — R_a adds L1, R_b adds per-sheet label, cap sheets
|
||||||
|
for r in results:
|
||||||
|
meta = r["meta"]
|
||||||
|
source_name = meta.get("source_name", "unknown")
|
||||||
|
l0 = (meta.get("L0", "") or "").replace("\n", " ")[:150]
|
||||||
|
l1 = (meta.get("L1", "") or "").replace("\n", " ")[:L1_MAX_CHARS]
|
||||||
|
sheets = meta.get("sheets", [])
|
||||||
|
shown = sheets[:SHEET_CAP]
|
||||||
|
extra = len(sheets) - len(shown)
|
||||||
|
|
||||||
|
print(f" {r['file_id']} (score:{r['score']},rare:{r['rare_hits']}) {source_name}")
|
||||||
|
if l0:
|
||||||
|
print(f" L0: {l0}")
|
||||||
|
if l1:
|
||||||
|
print(f" L1: {l1}")
|
||||||
|
for s in shown:
|
||||||
|
name = s.get("name", "?")
|
||||||
|
type_ = s.get("type", "?")
|
||||||
|
rc = s.get("row_count", s.get("block_count", "?"))
|
||||||
|
label = sheet_label(s)
|
||||||
|
print(f" - {name}[{type_},{rc}]: {label}")
|
||||||
|
if extra > 0:
|
||||||
|
print(f" - ... ({extra} more sheets omitted)")
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Reference in New Issue
Block a user