#!/bin/bash

## Copyright (C) 2022 - 2025 ENCRYPTED SUPPORT LLC <adrelanos@whonix.org>
## See the file COPYING for copying conditions.

## https://www.kicksecure.com/wiki/Unicode#grep-find-unicode-wrapper

## Strict mode, enabled with a single 'set' invocation:
##   errexit  - abort when a command fails
##   nounset  - abort when an unset variable is expanded
##   errtrace - let ERR traps be inherited by functions and subshells
##   pipefail - a pipeline fails if any of its stages fails
set -o errexit -o nounset -o errtrace -o pipefail

## Evaluate the exit status of a 'grep' invocation.
##
## Arguments:
##   $1 - exit status returned by 'grep'
## Returns:
##   0 if the status is 0 (match) or 1 (no match).
## Exits:
##   Terminates the script with the given status for any other value, after
##   writing an error message to stderr.
check_grep_status() {
  local grep_status="$1"

  case "$grep_status" in
    0)
      ## 'true' ignores its arguments; the text only shows up in xtrace output.
      true "$0: INFO: Match."
      ;;
    1)
      true "$0: INFO: No match."
      ;;
    *)
      printf '%s\n' "$0: ERROR: grep (syntax?) error! Exiting with code '$grep_status'." >&2
      exit "$grep_status"
      ;;
  esac
}

## Verify that the required external commands are available.
## A bare 'command -v' under errexit would end the script silently with exit
## status 1, which is the same status this script uses for "no match". Report
## the missing command on stderr and exit with status 2 instead (the status
## 'grep' uses for errors), so a broken installation cannot be mistaken for a
## clean scan result.
for required_command in stecho sort; do
  if ! command -v "$required_command" >/dev/null; then
    printf '%s\n' "$0: ERROR: Required command '$required_command' not found." >&2
    exit 2
  fi
done
unset required_command

## end-of-options ("--"):
## End-of-options is intentionally not used here, because this is a wrapper
## around 'grep' and the user is supposed to inject their own command line
## options, such as '--recursive'. It remains the responsibility of
## the user to use end-of-options.

## Options passed to every 'grep' invocation of this script.
grep_args=()
grep_args+=( --files-with-matches )
grep_args+=( --line-number )
## Required to find backspace and null characters.
grep_args+=( --binary-files=text )

## 'grep' exits 1 when nothing matches, which is an expected outcome here.
## errexit is therefore switched off for the searches and every exit status is
## evaluated explicitly by 'check_grep_status'.
set +o errexit

## Run 'grep' in the C locale with the shared options from 'grep_args'.
## Arguments: search-specific 'grep' options, followed by the arguments the
##            user passed to this script.
grep_bytes() {
  LC_ALL=C grep "${grep_args[@]}" "$@"
}

## one: anything outside of \x00-\x7F, expressed as a hex range.
one="$(grep_bytes --perl-regexp '[^\x00-\x7F]' "$@")"
check_grep_status "$?"

## two: the same idea, expressed with the [:ascii:] character class.
two="$(grep_bytes --perl-regexp '[^[:ascii:]]' "$@")"
check_grep_status "$?"

## three: bidirectional control characters.
## https://access.redhat.com/security/vulnerabilities/RHSB-2021-007
## https://lintian.debian.org/tags/unicode-trojan
##
## '--perl-regexp':
## 'grep's '--perl-regexp' option is not used for three, because it is not
## mentioned in the links above and can lead to the following error message:
# grep: PCRE2 does not support \F, \L, \l, \N{name}, \U, or \u
#three="$(LC_ALL=C grep "${grep_args[@]}" $'[\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]' "$@")"
## A different syntax (fixed byte sequences) is used to avoid an encoding issue.
##
## testfile content:
## 15.0.4
##
## hexdump:
## 0000000 3531 302e 342e 000a
## 0000007
##
## sha512sum testfile
## 0196f0043f8a77fb1d757acae601b460026b02857341c3ec407fbf0a88471882eae5beaa517afebabc63fabe21c84d10a2a67f9655ea2026d9b9b8f61969f585
bidi_control_bytes=(
  $'\xD8\x9C'      ## U+061C
  $'\xE2\x80\x8E'  ## U+200E
  $'\xE2\x80\x8F'  ## U+200F
  $'\xE2\x80\xAA'  ## U+202A
  $'\xE2\x80\xAB'  ## U+202B
  $'\xE2\x80\xAC'  ## U+202C
  $'\xE2\x80\xAD'  ## U+202D
  $'\xE2\x80\xAE'  ## U+202E
  $'\xE2\x81\xA6'  ## U+2066
  $'\xE2\x81\xA7'  ## U+2067
  $'\xE2\x81\xA8'  ## U+2068
  $'\xE2\x81\xA9'  ## U+2069
)

## Turn every byte sequence into its own '-e <pattern>' pair.
three_patterns=()
for bidi_sequence in "${bidi_control_bytes[@]}"; do
  three_patterns+=( -e "$bidi_sequence" )
done

three="$(grep_bytes --fixed-strings "${three_patterns[@]}" "$@")"
check_grep_status "$?"

## four: ASCII control characters. The class leaves out \x09 and \x0A.
four="$(grep_bytes --perl-regexp '[\x00-\x08\x0B\x0C\x0D\x0E-\x1F\x7F]' "$@")"
check_grep_status "$?"

## Searches are done; restore errexit.
set -o errexit

## Earlier approach, kept for reference:
#result="\
#$one
#$two
#$three
#$four"
## Problem: Extraneous newline at the end.
#output_message="$(printf '%s' "$result" | sort --unique)"

## Merge result lists into a single sorted list without duplicates.
##
## Arguments:
##   $@ - result lists (newline separated lines); empty lists are skipped so
##        that no blank line ends up in the output
## Outputs:
##   The merged lines on stdout.
##
## 'sort' is run with LC_ALL=C, like every 'grep' in this script, so ordering
## and de-duplication are done byte-wise and do not depend on the collation
## rules of the user's locale.
merge_unique_results() {
  local result_list

  for result_list in "$@"; do
    if [ -n "$result_list" ]; then
      printf '%s\n' "$result_list"
    fi
  done | LC_ALL=C sort --unique
}

output_message="$(merge_unique_results "${one:-}" "${two:-}" "${three:-}" "${four:-}")"

## None of the searches found anything: 'exit 1'.
## This mirrors 'grep', which also exits non-zero if no match has been found.
if [ -z "$output_message" ]; then
  exit 1
fi

## Print through 'stecho' in case the file names themselves contain unicode.
stecho "$output_message"

## Matches were found. The script ends here with the exit status of 'stecho',
## i.e. 0 on success, which mirrors 'grep' exiting 0 when matches have been found.
