import re
from dataclasses import dataclass
from disclosure_alpha.dictionaries import (
BOILERPLATE_PHRASES,
CONSTRAINING_WORDS,
FLAG_PATTERNS,
FLAG_SECTION_SCOPE,
FLAG_SUPPRESSIONS,
GEOGRAPHY_TERMS,
LEGAL_REGULATORY_PHRASES,
LITIGIOUS_WORDS,
MDNA_DENSITY_TERMS,
MDNA_SECTIONS,
MODAL_WORDS,
MODERATE_MODAL_WORDS,
NEGATIVE_WORDS,
SEGMENT_TERMS,
STRONG_MODAL_WORDS,
UNCERTAINTY_WORDS,
WEAK_MODAL_WORDS,
)
from disclosure_alpha.text_matching import (
boilerplate_hits,
phrase_count,
phrase_matches,
split_sentences,
tokenize_words,
)
# Public API of this module. ``tokenize_words`` is not defined here; it is
# imported from ``disclosure_alpha.text_matching`` and re-exported.
__all__ = [
    "SectionTextInput",
    "TextMetricResult",
    "METRIC_FAMILIES",
    "tokenize_words",
    "compute_text_metrics",
    "compute_metric_families",
    "detect_section_flags",
    "compute_density_metrics",
]
[docs]
@dataclass
class SectionTextInput:
    """Input record handed to ``compute_text_metrics`` / ``compute_metric_families``."""

    # Identifier of the section the text belongs to. Not read by
    # compute_text_metrics itself.
    section_name: str
    # Section text to analyse; compute_text_metrics treats a falsy value as "".
    # NOTE(review): presumably already cleaned upstream -- confirm with caller.
    cleaned_text: str
[docs]
@dataclass
class TextMetricResult:
    """Metrics produced by ``compute_text_metrics`` for a single section."""

    # Number of tokens returned by tokenize_words.
    word_count: int
    # Number of sentences found by split_sentences (0 when none are found).
    sentence_count: int
    # word_count / sentence_count, or 0.0 when there are no sentences.
    average_sentence_length: float
    # min(100, 2 * average sentence length + 100 * share of words longer than
    # six characters). compute_text_metrics always supplies a float here.
    readability_score: float | None
    # Share of words that appear in the corresponding word list.
    negative_word_ratio: float
    uncertainty_word_ratio: float
    litigious_word_ratio: float
    constraining_word_ratio: float
    modal_word_ratio: float
    weak_modal_word_ratio: float
    moderate_modal_word_ratio: float
    strong_modal_word_ratio: float
    # Total LEGAL_REGULATORY_PHRASES occurrences divided by word_count.
    legal_regulatory_phrase_ratio: float
    # Numeric tokens per 1000 words, capped at 100.
    numeric_specificity_score: float
    # (capitalised words + numeric tokens + matching geography terms +
    # matching segment terms) per 100 words, capped at 100.
    company_specificity_score: float
    # Boilerplate hits per sentence, capped at 1.0.
    boilerplate_phrase_ratio: float
def _tokenize(text: str) -> list[str]:
    """Split *text* into word tokens by delegating to ``tokenize_words``."""
    tokens = tokenize_words(text)
    return tokens
def _count_sentences(text: str) -> int:
    """Return how many sentences ``split_sentences`` finds in *text*.

    Returns 0 when the splitter yields nothing.
    """
    found = split_sentences(text)
    if not found:
        return 0
    # The floor of 1 is purely defensive: a non-empty result already has
    # a length of at least one.
    return max(1, len(found))
def _word_ratio(words: list[str], vocab: frozenset[str]) -> float:
if not words:
return 0.0
hits = sum(1 for w in words if w in vocab)
return hits / len(words)
def _phrase_ratio(lower: str, phrases: list[str], word_count: int) -> float:
if not word_count:
return 0.0
hits = sum(phrase_count(lower, phrase) for phrase in phrases)
return hits / word_count
[docs]
def compute_text_metrics(inp: SectionTextInput) -> TextMetricResult:
    """Compute length, lexicon and specificity metrics for one section.

    A falsy ``inp.cleaned_text`` is treated as an empty string. All
    word-normalised values fall back to 0.0 when no words are found.

    Derived scores:

    * ``readability_score`` -- ``2 * average sentence length`` plus the
      percentage of words longer than six characters, capped at 100.
    * ``numeric_specificity_score`` -- numeric tokens per 1000 words,
      capped at 100.
    * ``company_specificity_score`` -- capitalised words, numeric tokens,
      and the number of geography / segment terms that match the text,
      per 100 words, capped at 100.
    * ``boilerplate_phrase_ratio`` -- boilerplate hits per sentence,
      capped at 1.0.

    Lengths and scores are rounded to 4 decimals, ratios to 6.
    """
    raw_text = inp.cleaned_text or ""
    lowered = raw_text.lower()

    tokens = _tokenize(raw_text)
    n_words = len(tokens)
    n_sentences = _count_sentences(raw_text)
    mean_sentence_len = n_words / n_sentences if n_sentences else 0.0

    # Raw counts that feed the readability and specificity scores.
    n_long = len([tok for tok in tokens if len(tok) > 6])
    n_numeric = len(re.findall(r"\b\d+(?:[.,]\d+)?%?\b", raw_text))
    n_capitalized = len(re.findall(r"\b[A-Z][a-z]+\b", raw_text))
    # Geography / segment terms count once each, however often they occur.
    n_geo = len([term for term in GEOGRAPHY_TERMS if phrase_matches(lowered, term)])
    n_segment = len([term for term in SEGMENT_TERMS if phrase_matches(lowered, term)])

    if n_words:
        long_share = n_long / n_words
        numeric_per_thousand = n_numeric / n_words * 1000
        specific_per_hundred = (
            (n_capitalized + n_numeric + n_geo + n_segment) / n_words * 100
        )
    else:
        long_share = 0.0
        numeric_per_thousand = 0.0
        specific_per_hundred = 0.0

    readability = min(100.0, mean_sentence_len * 2 + long_share * 100)
    numeric_specificity = min(100.0, numeric_per_thousand)
    company_specificity = min(100.0, specific_per_hundred)

    n_boilerplate = boilerplate_hits(raw_text, BOILERPLATE_PHRASES)
    boilerplate_ratio = min(1.0, n_boilerplate / max(1, n_sentences))

    def lexicon_share(vocab: frozenset[str]) -> float:
        # Share of tokens found in *vocab*, rounded like every other ratio.
        return round(_word_ratio(tokens, vocab), 6)

    legal_ratio = _phrase_ratio(lowered, LEGAL_REGULATORY_PHRASES, n_words)

    return TextMetricResult(
        word_count=n_words,
        sentence_count=n_sentences,
        average_sentence_length=round(mean_sentence_len, 4),
        readability_score=round(readability, 4),
        negative_word_ratio=lexicon_share(NEGATIVE_WORDS),
        uncertainty_word_ratio=lexicon_share(UNCERTAINTY_WORDS),
        litigious_word_ratio=lexicon_share(LITIGIOUS_WORDS),
        constraining_word_ratio=lexicon_share(CONSTRAINING_WORDS),
        modal_word_ratio=lexicon_share(MODAL_WORDS),
        weak_modal_word_ratio=lexicon_share(WEAK_MODAL_WORDS),
        moderate_modal_word_ratio=lexicon_share(MODERATE_MODAL_WORDS),
        strong_modal_word_ratio=lexicon_share(STRONG_MODAL_WORDS),
        legal_regulatory_phrase_ratio=round(legal_ratio, 6),
        numeric_specificity_score=round(numeric_specificity, 4),
        company_specificity_score=round(company_specificity, 4),
        boilerplate_phrase_ratio=round(boilerplate_ratio, 6),
    )
# Names of the metric families; these match the "metric_family" values
# emitted by compute_metric_families.
METRIC_FAMILIES = ("tone", "specificity", "boilerplate", "liquidity", "internal_controls")
[docs]
def compute_metric_families(inp: SectionTextInput) -> list[dict[str, float | str]]:
    """Return one row per reported metric, grouped into metric families.

    Each row is a dict with the keys ``metric_family``, ``metric_name``,
    ``raw_value`` and ``normalized_value``. Ratio metrics are normalised by
    multiplying by 100; the specificity scores are passed through unchanged.
    Rows are emitted in the order: tone (2), specificity (2), boilerplate,
    liquidity, internal_controls.
    """
    metrics = compute_text_metrics(inp)

    # (family, TextMetricResult attribute, scale raw value by 100?)
    layout = (
        ("tone", "negative_word_ratio", True),
        ("tone", "uncertainty_word_ratio", True),
        ("specificity", "numeric_specificity_score", False),
        ("specificity", "company_specificity_score", False),
        ("boilerplate", "boilerplate_phrase_ratio", True),
        ("liquidity", "constraining_word_ratio", True),
        ("internal_controls", "modal_word_ratio", True),
    )

    rows: list[dict[str, float | str]] = []
    for family, metric_name, as_percent in layout:
        raw = getattr(metrics, metric_name)
        rows.append(
            {
                "metric_family": family,
                "metric_name": metric_name,
                "raw_value": raw,
                "normalized_value": raw * 100 if as_percent else raw,
            }
        )
    return rows
[docs]
def detect_section_flags(text: str, section_name: str) -> dict[str, bool]:
    """Evaluate every flag in ``FLAG_PATTERNS`` for one section.

    A flag is True when at least one sentence of *text* matches one of the
    flag's phrases and matches none of its suppression phrases. Flags whose
    scope (``FLAG_SECTION_SCOPE``) does not include *section_name* are
    reported as False. A falsy *text* is treated as an empty string.

    Returns a dict with one entry per flag name.
    """
    sentences = split_sentences(text or "")

    def sentence_fires(sentence, phrases, suppressions) -> bool:
        # Matching is done on the lower-cased sentence.
        lowered = sentence.lower()
        if not any(phrase_matches(lowered, phrase) for phrase in phrases):
            return False
        # A suppression phrase in the same sentence cancels the hit.
        if suppressions and any(phrase_matches(lowered, sup) for sup in suppressions):
            return False
        return True

    result: dict[str, bool] = {}
    for flag_name, phrases in FLAG_PATTERNS.items():
        in_scope = section_name in FLAG_SECTION_SCOPE.get(flag_name, frozenset())
        if in_scope:
            suppressions = FLAG_SUPPRESSIONS.get(flag_name, [])
            # any() stops at the first sentence that fires, like the
            # explicit break it replaces.
            result[flag_name] = any(
                sentence_fires(sentence, phrases, suppressions)
                for sentence in sentences
            )
        else:
            result[flag_name] = False
    return result
[docs]
def compute_density_metrics(text: str, section_name: str) -> dict[str, float]:
    """Return keyword densities for each entry of ``MDNA_DENSITY_TERMS``.

    Density is the total number of term occurrences per 1000 words, capped
    at 100 and rounded to 4 decimals. Sections not listed in
    ``MDNA_SECTIONS`` get 0.0 for every density. A falsy *text* is treated
    as an empty string, and the word count is floored at 1 to avoid
    dividing by zero.
    """
    if section_name not in MDNA_SECTIONS:
        return dict.fromkeys(MDNA_DENSITY_TERMS, 0.0)

    body = text or ""
    denominator = max(1, len(_tokenize(body)))
    lowered = body.lower()

    def density(terms) -> float:
        # Occurrences of all terms in the group, scaled to hits per 1000 words.
        occurrences = 0
        for term in terms:
            occurrences += phrase_count(lowered, term)
        return round(min(100.0, occurrences / denominator * 1000), 4)

    return {name: density(terms) for name, terms in MDNA_DENSITY_TERMS.items()}