Extractor | Cursor
import json
import re
from pathlib import Path
from typing import Any, Dict


class CursorExtractor:
    """Hybrid regex + placeholder for AI refinement.

    Holds a field-to-pattern schema and a list of accumulated results.
    """

    def __init__(self, schema: Dict[str, str]):
        """Store the extraction schema and start with no results.

        Args:
            schema: Mapping of field name to the regex pattern used for
                that field.
        """
        self.schema = schema  # field -> regex pattern
        # Must be its own statement: when fused onto the line above, the
        # inline comment swallows it and the attribute is never created.
        self.results = []
extractor.save("extractor/output/structured_logs.json")

Cursor Extractor
Extract from the selected log file:
- Timestamp (ISO format)
- Error level (ERROR/WARN/INFO)
- Message summary (max 50 chars)
- Component name

Return as a JSON array.
# Driver: run one extractor over every *.log file under data/raw/logs.
# `CursorExtractor` and `schema` are defined elsewhere in this file.
extractor = CursorExtractor(schema)
for log_file in Path("data/raw/logs").glob("*.log"):
    content = log_file.read_text()
    # The file path is passed alongside the text; extract_from_text is not
    # visible in this excerpt -- presumably it records the source, confirm.
    extractor.extract_from_text(content, str(log_file))
# NOTE(review): the original line ended with a truncated duplicate of the
# constructor signature ("def __init__(self, schema: Dict[str, str]): self").
# It is incomplete and is kept here only as a comment.
Try this prompt inside Cursor Composer today: “Extract all email addresses and dates from the selected text. Output JSON.”
That’s your first extraction. From there, build your own extractor library.
def save(self, output_path: str):
    """Write the accumulated results to ``output_path`` as indented JSON.

    This is the ``save`` method of ``CursorExtractor``; it reads
    ``self.results``.

    Args:
        output_path: Destination file path; an existing file is overwritten.
    """
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(self.results, f, indent=2)


# Field name -> regex pattern; each pattern captures the value in group 1.
# The {n} quantifiers and the dict braces are restored here: without them
# "\d4" means "one digit followed by a literal 4" and never matches a real
# timestamp. Dots are escaped so they match a literal "." only.
schema = {
    "timestamp": r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)",
    "request_id": r"RequestId: ([a-f0-9-]+)",
    "duration_ms": r"Duration: (\d+\.\d+) ms",
    "memory_mb": r"MemorySize: (\d+) MB",
}