Source code for disclosure_alpha.text_metrics

import re
from dataclasses import dataclass

from disclosure_alpha.dictionaries import (
    BOILERPLATE_PHRASES,
    CONSTRAINING_WORDS,
    FLAG_PATTERNS,
    FLAG_SECTION_SCOPE,
    FLAG_SUPPRESSIONS,
    GEOGRAPHY_TERMS,
    LEGAL_REGULATORY_PHRASES,
    LITIGIOUS_WORDS,
    MDNA_DENSITY_TERMS,
    MDNA_SECTIONS,
    MODAL_WORDS,
    MODERATE_MODAL_WORDS,
    NEGATIVE_WORDS,
    SEGMENT_TERMS,
    STRONG_MODAL_WORDS,
    UNCERTAINTY_WORDS,
    WEAK_MODAL_WORDS,
)
from disclosure_alpha.text_matching import (
    boilerplate_hits,
    phrase_count,
    phrase_matches,
    split_sentences,
    tokenize_words,
)

# Public API of this module. ``tokenize_words`` is re-exported from
# ``disclosure_alpha.text_matching``; the remaining names are defined below.
__all__ = [
    "SectionTextInput",
    "TextMetricResult",
    "METRIC_FAMILIES",
    "tokenize_words",
    "compute_text_metrics",
    "compute_metric_families",
    "detect_section_flags",
    "compute_density_metrics",
]


@dataclass
class SectionTextInput:
    """Input record for the section-level text metric functions.

    Attributes:
        section_name: Name of the section the text belongs to.
        cleaned_text: Text of the section. ``compute_text_metrics`` treats a
            falsy value (``None`` or ``""``) as the empty string.
    """

    section_name: str
    cleaned_text: str
[docs] @dataclass class TextMetricResult: word_count: int sentence_count: int average_sentence_length: float readability_score: float | None negative_word_ratio: float uncertainty_word_ratio: float litigious_word_ratio: float constraining_word_ratio: float modal_word_ratio: float weak_modal_word_ratio: float moderate_modal_word_ratio: float strong_modal_word_ratio: float legal_regulatory_phrase_ratio: float numeric_specificity_score: float company_specificity_score: float boilerplate_phrase_ratio: float
def _tokenize(text: str) -> list[str]:
    """Split *text* into word tokens by delegating to ``tokenize_words``."""
    return tokenize_words(text)


def _count_sentences(text: str) -> int:
    """Return the number of sentences ``split_sentences`` finds in *text*.

    Returns 0 when no sentences are found, so callers can guard their
    per-sentence divisions on the result.
    """
    sentences = split_sentences(text)
    if not sentences:
        return 0
    return max(1, len(sentences))


def _word_ratio(words: list[str], vocab: frozenset[str]) -> float:
    """Return the fraction of *words* that are members of *vocab*.

    Returns 0.0 for an empty word list instead of dividing by zero.
    """
    if not words:
        return 0.0
    hits = sum(1 for word in words if word in vocab)
    return hits / len(words)


def _phrase_ratio(lower: str, phrases: list[str], word_count: int) -> float:
    """Return total occurrences of *phrases* in *lower* divided by *word_count*.

    Args:
        lower: Text to search; the caller in this module passes lowercased text.
        phrases: Phrases whose occurrences are counted with ``phrase_count``.
        word_count: Denominator; a value of 0 short-circuits to 0.0 without
            searching.
    """
    if not word_count:
        return 0.0
    hits = sum(phrase_count(lower, phrase) for phrase in phrases)
    return hits / word_count
def compute_text_metrics(inp: SectionTextInput) -> TextMetricResult:
    """Compute word-list, specificity and boilerplate metrics for one section.

    Args:
        inp: Section input; only ``cleaned_text`` is read here, and a falsy
            value is treated as the empty string.

    Returns:
        A ``TextMetricResult`` with ratios rounded to 6 decimal places and
        scores/averages rounded to 4. Every division is guarded, so empty
        text yields zeros rather than raising.
    """
    text = inp.cleaned_text or ""
    lower = text.lower()
    words = _tokenize(text)
    word_count = len(words)
    sentence_count = _count_sentences(text)
    avg_sentence_len = word_count / sentence_count if sentence_count else 0.0

    # Readability heuristic: longer sentences and a larger share of long
    # words (more than 6 characters) both raise the score, capped at 100.
    long_words = sum(1 for w in words if len(w) > 6)
    long_word_pct = (long_words / word_count) if word_count else 0.0
    readability = min(100.0, avg_sentence_len * 2 + long_word_pct * 100)

    # Numeric tokens per 1000 words, capped at 100.
    # NOTE(review): the pattern allows only one "." or "," group, so a number
    # with several separators such as "1,234,567" is counted as two tokens.
    # Left unchanged to keep scores comparable with earlier runs - confirm
    # whether that double count is intended.
    numeric_tokens = len(re.findall(r"\b\d+(?:[.,]\d+)?%?\b", text))
    numeric_specificity = min(
        100.0, (numeric_tokens / word_count * 1000) if word_count else 0.0
    )

    # Company specificity: capitalized words, numeric tokens and the number of
    # distinct geography/segment terms present, per 100 words, capped at 100.
    capitalized = len(re.findall(r"\b[A-Z][a-z]+\b", text))
    geo_hits = sum(1 for g in GEOGRAPHY_TERMS if phrase_matches(lower, g))
    segment_hits = sum(1 for s in SEGMENT_TERMS if phrase_matches(lower, s))
    company_specificity = min(
        100.0,
        ((capitalized + numeric_tokens + geo_hits + segment_hits) / word_count * 100)
        if word_count
        else 0.0,
    )

    # Boilerplate hits per sentence, capped at 1.0; max(1, ...) guards the
    # division when no sentences were found.
    bp_hits = boilerplate_hits(text, BOILERPLATE_PHRASES)
    boilerplate_ratio = min(1.0, bp_hits / max(1, sentence_count))

    return TextMetricResult(
        word_count=word_count,
        sentence_count=sentence_count,
        average_sentence_length=round(avg_sentence_len, 4),
        readability_score=round(readability, 4),
        negative_word_ratio=round(_word_ratio(words, NEGATIVE_WORDS), 6),
        uncertainty_word_ratio=round(_word_ratio(words, UNCERTAINTY_WORDS), 6),
        litigious_word_ratio=round(_word_ratio(words, LITIGIOUS_WORDS), 6),
        constraining_word_ratio=round(_word_ratio(words, CONSTRAINING_WORDS), 6),
        modal_word_ratio=round(_word_ratio(words, MODAL_WORDS), 6),
        weak_modal_word_ratio=round(_word_ratio(words, WEAK_MODAL_WORDS), 6),
        moderate_modal_word_ratio=round(_word_ratio(words, MODERATE_MODAL_WORDS), 6),
        strong_modal_word_ratio=round(_word_ratio(words, STRONG_MODAL_WORDS), 6),
        legal_regulatory_phrase_ratio=round(
            _phrase_ratio(lower, LEGAL_REGULATORY_PHRASES, word_count), 6
        ),
        numeric_specificity_score=round(numeric_specificity, 4),
        company_specificity_score=round(company_specificity, 4),
        boilerplate_phrase_ratio=round(boilerplate_ratio, 6),
    )
# Metric family labels; these are the ``metric_family`` values that appear in
# the rows built by ``compute_metric_families``.
METRIC_FAMILIES = ("tone", "specificity", "boilerplate", "liquidity", "internal_controls")
def compute_metric_families(inp: SectionTextInput) -> list[dict[str, float | str]]:
    """Return metric family rows with raw and normalized values.

    Families covered: tone, specificity, boilerplate, liquidity and
    internal_controls.

    Args:
        inp: Section input forwarded to ``compute_text_metrics``.

    Returns:
        One dict per metric, in a fixed order, with the keys
        ``metric_family``, ``metric_name``, ``raw_value`` and
        ``normalized_value``. Ratio metrics are normalized by multiplying by
        100; score metrics are passed through unchanged.
    """
    base = compute_text_metrics(inp)

    # (family, TextMetricResult attribute, whether the value is a 0-1 ratio
    # that must be scaled by 100 for the normalized column)
    row_specs = (
        ("tone", "negative_word_ratio", True),
        ("tone", "uncertainty_word_ratio", True),
        ("specificity", "numeric_specificity_score", False),
        ("specificity", "company_specificity_score", False),
        ("boilerplate", "boilerplate_phrase_ratio", True),
        ("liquidity", "constraining_word_ratio", True),
        ("internal_controls", "modal_word_ratio", True),
    )

    rows: list[dict[str, float | str]] = []
    for family, metric_name, is_ratio in row_specs:
        raw_value = getattr(base, metric_name)
        rows.append(
            {
                "metric_family": family,
                "metric_name": metric_name,
                "raw_value": raw_value,
                "normalized_value": raw_value * 100 if is_ratio else raw_value,
            }
        )
    return rows
def detect_section_flags(text: str, section_name: str) -> dict[str, bool]:
    """Return all v1 boolean flags for a section (False when out of scope).

    A flag is set when at least one sentence contains one of the flag's
    trigger phrases and none of its suppression phrases. Matching is done per
    sentence on lowercased text.

    Args:
        text: Section text; a falsy value is treated as the empty string.
        section_name: Section name checked against each flag's scope in
            ``FLAG_SECTION_SCOPE``.

    Returns:
        A dict with one entry for every key of ``FLAG_PATTERNS``.
    """
    # Lowercase each sentence once instead of once per flag.
    lowered_sentences = [sentence.lower() for sentence in split_sentences(text or "")]

    flags: dict[str, bool] = {}
    for flag_name, phrases in FLAG_PATTERNS.items():
        # A flag without a scope entry applies to no section.
        scope = FLAG_SECTION_SCOPE.get(flag_name, frozenset())
        if section_name not in scope:
            flags[flag_name] = False
            continue

        suppressions = FLAG_SUPPRESSIONS.get(flag_name, [])
        # A suppression only cancels the sentence it occurs in; a later
        # sentence can still raise the flag.
        flags[flag_name] = any(
            any(phrase_matches(sentence, phrase) for phrase in phrases)
            and not any(phrase_matches(sentence, sup) for sup in suppressions)
            for sentence in lowered_sentences
        )
    return flags
def compute_density_metrics(text: str, section_name: str) -> dict[str, float]:
    """MD&A keyword density: hits per 1000 words, capped at 100.

    Args:
        text: Section text; a falsy value is treated as the empty string.
        section_name: Section name; sections not in ``MDNA_SECTIONS`` get 0.0
            for every density.

    Returns:
        A dict with one entry per key of ``MDNA_DENSITY_TERMS``, each rounded
        to 4 decimal places.
    """
    if section_name not in MDNA_SECTIONS:
        return {name: 0.0 for name in MDNA_DENSITY_TERMS}

    source = text or ""
    # max(1, ...) keeps the division defined when the text has no tokens.
    word_count = max(1, len(_tokenize(source)))
    lower = source.lower()

    densities: dict[str, float] = {}
    for name, terms in MDNA_DENSITY_TERMS.items():
        hits = sum(phrase_count(lower, term) for term in terms)
        per_thousand = hits / word_count * 1000
        densities[name] = round(min(100.0, per_thousand), 4)
    return densities