#!/usr/bin/python3 -su

## Copyright (C) 2025 - 2025 ENCRYPTED SUPPORT LLC <adrelanos@whonix.org>
## See the file COPYING for copying conditions.

"""
This script scans input text or files for non-ASCII and suspicious Unicode characters.
It prints lines with suspicious characters annotated inline (e.g., [U+XXXX]).
For each such character, it prints the Unicode codepoint, name, and category.

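Exit codes:
  0 - Nothing suspicious found
  1 - Suspicious content found (suspicious Unicode, trailing whitespace,
      missing newline at end of input), or input is not valid UTF-8
  2 - Error (e.g., file I/O error or unexpected error)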
"""

import sys
import unicodedata
import string
import io
import os

def color_enabled(environ, is_tty):
    """Decide whether ANSI color escape sequences should be written.

    Args:
        environ: Mapping of environment variable names to their values.
        is_tty: Zero-argument callable that reports whether the output is a
            terminal. It is only called when no environment variable has
            already disabled color.

    Returns:
        True when color should be used, False otherwise.
    """
    ## "NOCOLOR" is kept for backward compatibility. "NO_COLOR" is the
    ## cross-tool convention (https://no-color.org/). A non-empty value of
    ## either one disables color.
    if environ.get("NOCOLOR") or environ.get("NO_COLOR"):
        return False
    if environ.get("TERM") == "dumb":
        return False
    return bool(is_tty())

## Evaluated once at start-up; the lambda defers the terminal check until
## the environment variables have been consulted.
USE_COLOR = color_enabled(os.environ, lambda: sys.stdout.isatty())

## ANSI escape sequences used by colorize().
RED = "\033[91m"
CYAN = "\033[96m"
RESET = "\033[0m"

## Two independently built allow-lists. A character has to be on both of
## them to be treated as harmless.
VISIBLE_ASCII_RANGE = range(0x20, 0x7F)
ALLOWED_WHITESPACE = {'\n', '\r', '\t'}
SAFE_ASCII_SEMANTIC = set(string.ascii_letters + string.digits + string.punctuation + " \n\r\t")

def colorize(text, color):
    """Return *text* wrapped in *color* and the reset sequence, or unchanged when color is off."""
    if not USE_COLOR:
        return text
    return "{}{}{}".format(color, text, RESET)

def is_suspicious(c):
    """Return True when the single character *c* fails either allow-list check."""
    ## Both allow-lists are consulted on purpose (redundancy for extra
    ## safety); a character is harmless only when it passes both.
    passes_codepoint_check = ord(c) in VISIBLE_ASCII_RANGE or c in ALLOWED_WHITESPACE
    passes_semantic_check = c in SAFE_ASCII_SEMANTIC
    if passes_codepoint_check and passes_semantic_check:
        return False
    return True

def describe_char(c):
    """Return a description of a character including codepoint, name, and category.

    The description has the form ``<display> (U+XXXX, <name>, <category>)``
    and is passed through colorize() with CYAN.

    Args:
        c: A single-character string.

    Returns:
        The description string (colorized when color output is enabled).
    """
    code = ord(c)
    ## Characters for which unicodedata has no name get a placeholder.
    name = unicodedata.name(c, "<unnamed>")
    cat = unicodedata.category(c)

    codepoint_allowed = code in VISIBLE_ASCII_RANGE or c in ALLOWED_WHITESPACE
    semantically_allowed = c in SAFE_ASCII_SEMANTIC

    ## Purposeful redundancy for extra safety in character display.
    ## Whitespace is never shown raw even though it is on the allow-lists:
    ## scan_line() also uses this function to report trailing whitespace, and
    ## a raw space, tab, CR or LF would be invisible or would break up the
    ## output line. repr() makes it readable (e.g. ' ' or '\t').
    if codepoint_allowed and semantically_allowed and not c.isspace():
        display = c
    else:
        display = repr(c)

    desc = f"{display} (U+{code:04X}, {name}, {cat})"
    return colorize(desc, CYAN)

def scan_line(line, lineno=None, filename=None):
    """Scan a single line and print a report if it contains anything suspicious.

    A character is reported when is_suspicious() flags it, or when it belongs
    to the run of whitespace at the end of the line (after one final newline
    has been removed). Each reported character is replaced inline by a
    ``[U+XXXX]`` marker, and one ``   -> ...`` description line per reported
    character is printed below the annotated line.

    Args:
        line: The text of the line, with or without its final newline.
        lineno: Line number used in the output prefix; left out of the prefix
            when None.
        filename: Name used in the output prefix; '<stdin>' when not given.

    Returns:
        True if something was reported, False if the line is clean (in which
        case nothing is printed).
    """
    ## The single newline that terminates the line is not part of its content.
    body = line[:-1] if line.endswith("\n") else line

    ## Trailing whitespaces are suspicious.
    ## https://forums.whonix.org/t/detecting-malicious-unicode-in-source-code-and-pull-requests/13754/28
    ## Find where the trailing run of otherwise harmless whitespace begins.
    ## Whitespace that is_suspicious() already flags ends the run; it is
    ## reported by the normal per-character check below.
    content_len = len(body)
    while (
        content_len
        and body[content_len - 1].isspace()
        and not is_suspicious(body[content_len - 1])
    ):
        content_len -= 1

    ## Collect the pieces in lists and join once instead of growing a string.
    pieces = []
    suspicious_descrs = []
    for index, c in enumerate(body):
        if index >= content_len or is_suspicious(c):
            pieces.append(colorize(f"[U+{ord(c):04X}]", RED))
            suspicious_descrs.append(f"   -> {describe_char(c)}")
        else:
            pieces.append(c)

    if not suspicious_descrs:
        return False

    location = filename or '<stdin>'
    ## Without a line number, do not print a literal "None" in the prefix.
    prefix = f"{location}: " if lineno is None else f"{location}:{lineno}: "

    print(prefix + "".join(pieces))
    for suspicious_descr in suspicious_descrs:
        print(suspicious_descr)

    return True

def scan_file(f, filename=None):
    """Scan all lines of a file-like object and report whether anything was flagged.

    Every line is handed to scan_line(). In addition, a last line that does
    not end with a newline is reported as suspicious.

    Args:
        f: Iterable yielding the lines of the input as strings.
        filename: Name used in the output; '<stdin>' is shown when not given.

    Returns:
        True if any line was flagged or the final newline is missing,
        otherwise False.
    """
    flagged = False
    final_lineno = 0
    ## Stays None for empty input, which must not report a missing newline.
    final_line = None

    for final_lineno, final_line in enumerate(f, 1):
        if scan_line(final_line, lineno=final_lineno, filename=filename):
            flagged = True

    if final_line is None or final_line.endswith('\n'):
        return flagged

    ## Missing newline at the end is suspicious.
    location = f"{filename or '<stdin>'}:{final_lineno}: "
    print(location + colorize("[missing newline at end]", RED))
    return True

if __name__ == "__main__":
    ## Entry point: scan every file named on the command line, or standard
    ## input when no file is given. Exits with status 1 when anything was
    ## flagged (including input that is not valid UTF-8) and with status 2
    ## on any other error.
    ##
    ## NOTE(review): both open() and io.TextIOWrapper are used with their
    ## default newline handling, which translates '\r\n' and a lone '\r'
    ## to '\n' before the text is scanned - confirm that carriage returns
    ## are meant to go unreported.
    found_something = False
    paths = sys.argv[1:]
    try:
        if not paths:
            try:
                text_stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='strict')
                if scan_file(text_stdin):
                    found_something = True
            except UnicodeDecodeError as err:
                print(f"[ERROR] Unicode decode error [stdin]: {err}", file=sys.stderr)
                found_something = True

        for path in paths:
            try:
                ## Must not use errors='replace' because otherwise suspicious unicode might slip.
                ## Fail closed for non-UTF-8.
                with open(path, 'r', encoding='utf-8', errors='strict') as handle:
                    if scan_file(handle, filename=path):
                        found_something = True
            except UnicodeDecodeError as err:
                ## Undecodable input counts as a finding; keep scanning the remaining files.
                print(f"[ERROR] Unicode decode error [{path}]: {err}", file=sys.stderr)
                found_something = True
            except Exception as err:
                ## Any other failure for a file aborts the whole run.
                print(f"[ERROR] File read error [{path}]: {err}", file=sys.stderr)
                sys.exit(2)
    except Exception as err:
        print(f"[ERROR] Unexpected error [main]: {err}", file=sys.stderr)
        sys.exit(2)

    if found_something:
        sys.exit(1)
