"""Return full schema + sample data for specified file:sheet pairs. Usage: python3 detail.py ,,... Output: Per sheet — columns with type/stats/description + sample rows (from knowledge.md body). datasets directory: ./datasets/ (gbase-agent-service) or ./dataset/ (catalog-agent), auto-detected at runtime. dataset_ids are discovered automatically from subdirectories under datasets directory. """ import os import re import sys import yaml sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from _session import get_session_dir # Derive project root from script location: scripts/ → kfs-answer/ → skills/ → project root _PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) _ds = os.path.join(_PROJECT_ROOT, "datasets") DATASETS_DIR = _ds if os.path.isdir(_ds) else os.path.join(_PROJECT_ROOT, "dataset") def _discover_datasets(): """Scan DATASETS_DIR for subdirectory names (each is a dataset_id).""" if not os.path.isdir(DATASETS_DIR): return [] return [d for d in sorted(os.listdir(DATASETS_DIR)) if os.path.isdir(os.path.join(DATASETS_DIR, d))] def load_file_ref_map(): """Load file_id → F{n} mapping from file_refs.txt (in session dir).""" refs_path = os.path.join(get_session_dir(), "file_refs.txt") mapping = {} if not os.path.isfile(refs_path): return mapping ref_pat = re.compile(r"^(F\d+)=([0-9a-f-]+)\(") with open(refs_path, "r", encoding="utf-8") as f: for line in f: m = ref_pat.match(line.strip()) if m: mapping[m.group(2)] = m.group(1) return mapping def find_file_dir(dataset_ids, file_id): for dataset_id in dataset_ids: candidate = os.path.join(DATASETS_DIR, dataset_id, file_id) if os.path.isdir(candidate): return candidate return None def load_knowledge(file_dir): """Parse knowledge.md → (meta dict, body text).""" km_path = os.path.join(file_dir, "knowledge.md") if not os.path.isfile(km_path): return None, None with open(km_path, "r", encoding="utf-8") as f: content = f.read() if not content.startswith("---"): return None, None parts = content.split("---", 2) if len(parts) < 3: return None, None meta = yaml.safe_load(parts[1]) body = parts[2].strip() return meta, body def extract_sheet_body(body, sheet_id): """Extract body section for a specific sheet (delimited by ).""" parts = re.split(r"", body) markers = re.findall(r"", body) for i, marker in enumerate(markers): if marker == sheet_id and i < len(parts) - 1: return parts[i + 1].strip() # Fallback: if only one sheet and no markers, return entire body if len(markers) == 0 and len(parts) == 1: return body.strip() return "" def extract_sheet_src(body, sheet_id): """Extract __src value from marker. Returns empty string if not found.""" m = re.search(rf'