`classify_column`

Public callable
Classify one column using term matching, metadata cues, and business context.
Parameters:
Name	Type	Description	Default
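`column_name`	`str`	Column name; included in the text matched against classification terms and rule patterns, and echoed in the result.	required
`data_type`	`str | None`	Declared data type of the column; echoed in the result and its evidence, not used for scoring.	`None`
`profile`	`dict | None`	Optional profiling output; the keys read are `inferred_semantic_type`, `avg_length` (or `average_length`), and `distinct_pct`.	`None`
`metadata`	`dict | None`	Optional column metadata; `description` and `business_term` are added to the matched text.	`None`
`business_context`	`str | dict | None`	Free-text context, or a dict whose values are added to the matched text.	`None`
`rules`	`list[dict] | None`	Custom rules; the keys read are `patterns`, `rule_id`, `confidence`, `classification`, `reason`, and `action`.	`None`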
Returns:
Type	Description
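`dict`	Suggestion record with `column_name`, `data_type`, `suggested_classification`, `confidence` (clamped to the range 0–1), `reason`, `evidence`, `suggested_action`, and the review fields `status`, `generated_by`, `approved_by`, and `approved_at`.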
Source code in src/fabricops_kit/governance.py
def classify_column(
    column_name: str,
    data_type: str | None = None,
    profile: dict | None = None,
    metadata: dict | None = None,
    business_context: str | dict | None = None,
    rules: list[dict] | None = None,
) -> dict:
    """Classify one column using term matching, metadata cues, and business context.

    Signals are evaluated in a fixed order; each stage may replace or
    strengthen the current best suggestion:

    1. ``profile["inferred_semantic_type"]`` is looked up in a built-in map.
    2. Each category in ``DEFAULT_CLASSIFICATION_TERMS`` is matched against the
       combined text (column name, metadata, business context); a match wins
       only if it is strictly more confident than the current suggestion.
    3. Profile statistics raise confidence: ``avg_length >= 80`` lifts a
       ``sensitive_free_text`` suggestion to at least 0.82, and
       ``distinct_pct >= 95`` lifts ``identifier``/``contact`` to at least 0.9.
    4. Custom ``rules`` whose patterns match replace the suggestion when their
       confidence is greater than *or equal to* the current one.

    Parameters
    ----------
    column_name : str
        Column name; part of the matched text and echoed in the result.
    data_type : str or None
        Declared data type; echoed in the result and evidence only.
    profile : dict or None
        Optional profiling output. Keys read: ``inferred_semantic_type``,
        ``avg_length`` (or ``average_length``), ``distinct_pct``.
    metadata : dict or None
        Optional column metadata. Keys read: ``description``,
        ``business_term``.
    business_context : str, dict or None
        Free-text context, or a dict whose non-``None`` values are added to the
        matched text.
    rules : list of dict or None
        Custom rules. Keys read: ``patterns``, ``rule_id``, ``confidence``,
        ``classification``, ``reason``, ``action``.

    Returns
    -------
    dict
        Suggestion record with ``suggested_classification``, ``confidence``
        (clamped to ``[0.0, 1.0]``), ``reason``, ``evidence``,
        ``suggested_action`` and review fields (``status``, ``generated_by``,
        ``approved_by``, ``approved_at``).
    """
    # Local import keeps this change self-contained (no file-level edit needed).
    import re

    profile = profile or {}
    metadata = metadata or {}

    text_parts = [
        column_name,
        str(metadata.get("description") or ""),
        str(metadata.get("business_term") or ""),
    ]
    if isinstance(business_context, str):
        text_parts.append(business_context)
    elif isinstance(business_context, dict):
        # Skip None values so the literal text "None" never enters the blob
        # (mirrors the `or ""` handling of the metadata fields above).
        text_parts.extend(str(v) for v in business_context.values() if v is not None)
    text_blob = " ".join(p for p in text_parts if p)
    lowered_blob = text_blob.lower()

    matched_terms: list[str] = []
    matched_rule_ids: list[str] = []
    profile_signals: dict[str, Any] = {}
    business_signals: list[str] = []

    # Baseline suggestion used when no signal fires.
    best = {
        "classification": "unknown",
        "confidence": 0.2,
        "reason": "No governance signal detected",
        "action": "review",
    }

    # --- 1. Semantic type reported by the profiler -------------------------
    semantic_map = {
        "email": ("contact", 0.82, "Profile semantic type indicates contact data"),
        "phone": ("contact", 0.8, "Profile semantic type indicates contact data"),
        "person_name": ("personal_data", 0.8, "Profile semantic type indicates personal data"),
        "identifier": ("identifier", 0.85, "Profile semantic type indicates identifier"),
        "amount": ("financial", 0.8, "Profile semantic type indicates financial data"),
        "free_text": ("sensitive_free_text", 0.72, "Profile semantic type indicates free text"),
    }
    inferred_semantic_type = str(profile.get("inferred_semantic_type") or "").lower()
    if inferred_semantic_type in semantic_map:
        cls, conf, reason = semantic_map[inferred_semantic_type]
        best = {
            "classification": cls,
            "confidence": conf,
            "reason": reason,
            "action": DEFAULT_ACTION_BY_CLASSIFICATION.get(cls, "review"),
        }

    # --- 2. Built-in term matching -----------------------------------------
    high_confidence_categories = {"identifier", "contact", "financial", "health"}
    for category, terms in DEFAULT_CLASSIFICATION_TERMS.items():
        matches = _match_terms(text_blob, terms)
        if not matches:
            continue
        if category == "sensitive_free_text":
            conf = 0.68
        elif category in high_confidence_categories:
            conf = 0.75
        else:
            conf = 0.7
        # Strict ">" so an earlier signal of equal confidence is kept.
        if conf > best["confidence"]:
            best = {
                "classification": category,
                "confidence": conf,
                "reason": f"Name/metadata matched {category} terms",
                "action": DEFAULT_ACTION_BY_CLASSIFICATION.get(category, "review"),
            }
        matched_terms.extend(matches)

    # --- 3. Profile statistics can only raise confidence --------------------
    avg_len = profile.get("avg_length") or profile.get("average_length")
    distinct_pct = profile.get("distinct_pct")
    if isinstance(avg_len, (int, float)) and avg_len >= 80:
        profile_signals["long_text"] = True
        if best["classification"] == "sensitive_free_text":
            best["confidence"] = max(best["confidence"], 0.82)
    # NOTE(review): the 95 threshold assumes distinct_pct is on a 0-100 scale — confirm.
    if isinstance(distinct_pct, (int, float)) and distinct_pct >= 95:
        profile_signals["high_uniqueness"] = True
        if best["classification"] in {"identifier", "contact"}:
            best["confidence"] = max(best["confidence"], 0.9)

    # --- 4. Custom rules (win ties against built-in signals) ----------------
    for rule in rules or []:
        # NOTE(review): patterns are lower-cased but text_blob is passed as-is;
        # presumably _phrase_in_text is case-insensitive — confirm.
        patterns = [str(p).lower() for p in (rule.get("patterns") or [])]
        if not any(_phrase_in_text(p, text_blob) for p in patterns):
            continue
        matched_rule_ids.append(str(rule.get("rule_id") or ""))
        # Coerce once so numeric strings (e.g. "0.9" from a YAML/JSON rule file)
        # compare correctly instead of raising TypeError against a float.
        rule_confidence = float(rule.get("confidence") or 0)
        if rule_confidence >= best["confidence"]:
            rule_classification = rule.get("classification")
            # NOTE(review): when the rule has no classification the current one
            # is kept, but the default action is looked up under "unknown".
            # Preserved as-is; confirm this is intended.
            default_action = DEFAULT_ACTION_BY_CLASSIFICATION.get(
                str(rule_classification or "unknown"), "review"
            )
            best = {
                "classification": str(rule_classification or best["classification"]),
                "confidence": rule_confidence,
                "reason": str(rule.get("reason") or "Matched custom governance rule"),
                "action": str(rule.get("action") or default_action),
            }

    # --- Business-context evidence (informational only) ---------------------
    for token in ("student", "staff", "hr", "payroll", "medical"):
        if len(token) <= 3:
            # Very short tokens must not be embedded in a longer word, otherwise
            # "hr" fires on "three", "threshold", "chrome", ...  Underscores,
            # digits and spaces still count as boundaries ("hr_dept" matches).
            pattern = rf"(?<![a-z]){re.escape(token)}(?![a-z])"
            found = re.search(pattern, lowered_blob) is not None
        else:
            # Longer tokens keep substring semantics so plurals and compound
            # names ("students", "payroll_id") still register.
            found = token in lowered_blob
        if found:
            business_signals.append(token)

    return {
        "column_name": column_name,
        "data_type": data_type,
        "suggested_classification": best["classification"],
        # Clamp because custom rules may supply out-of-range confidences.
        "confidence": max(0.0, min(1.0, float(best["confidence"]))),
        "reason": best["reason"],
        "evidence": {
            "matched_terms": sorted(set(matched_terms)),
            "matched_rule_ids": [rid for rid in matched_rule_ids if rid],
            "data_type": data_type,
            "profile_signals": to_jsonable(profile_signals),
            "business_context_signals": sorted(set(business_signals)),
        },
        "suggested_action": best["action"],
        "status": "suggested",
        "generated_by": "framework",
        "approved_by": None,
        "approved_at": None,
    }