#!/bin/bash

## example:
## mw-file-to-web-archive ~/sourcesown/web-links.txt

set -e

source /usr/share/mediawiki-shell/common

## Number of seconds to sleep between retry attempts (used by the retry loop
## and mentioned in the "too many requests" message). Only set when the
## variable is not already defined, so it can be overridden from outside.
if ! [[ -v wait_if_too_many_requests ]]; then
   wait_if_too_many_requests=10
fi

## Default curl options for talking to the web archive. Only set when not
## already defined. The value is a plain string that is word-split on use:
##   --output ...        response (headers, because of --head) goes to a temp file
##   --head / --location header-only request, following redirects
##   -w %{http_code}     the HTTP status code is printed on stdout
if ! [[ -v curl_web_archive_opts ]]; then
   curl_web_archive_opts="\
   --output $TMPFOLDER/webarchive/curl-output-temp \
   --silent \
   --no-progress-meter \
   --retry-connrefused \
   --head \
   --location \
   -w %{http_code} \
"
fi

## Make sure the cache folder and the two base cache files exist, so that the
## 'already-*' glob used later always has something to match.
mkdir -p "$TMPFOLDER/webarchive"
touch \
   "$TMPFOLDER/webarchive/already-archived.txt" \
   "$TMPFOLDER/webarchive/already-known-down.txt"

## Tell the user where the cache files live.
printf '%s\n' \
   "$TMPFOLDER/webarchive/already-archived.txt" \
   "$TMPFOLDER/webarchive/already-known-down.txt"

## Ask the web archive (web.archive.org) whether a URL is already archived.
##
## Globals read:
##   curl                   command used to perform the request
##   curl_web_archive_opts  option string, intentionally word-split
##   TMPFOLDER              base folder; state is kept in $TMPFOLDER/webarchive
## Arguments:
##   $1 - the URL to look up
## Outputs:
##   Progress messages on stdout. Appends to cache files:
##     already-known-excluded.txt  on HTTP 403
##     already-known-down.txt      on HTTP 404
##     already-archived.txt        (URL and snapshot link) when a 'location'
##                                 header points to https://web.archive.org/web/
## Returns:
##   0 if the status of the URL is settled (excluded, down or archived),
##   1 if the URL does not appear to be archived yet.
archive_check() {
   local curl_http_status_code curl_output location link
   local curl_output_file="$TMPFOLDER/webarchive/curl-output-temp"

   echo "Checking if already archived by asking web archive... https://web.archive.org/web/$1"

   ## Drop the output of any previous request. Otherwise a failed curl call
   ## would leave stale headers behind and this URL could be misreported as
   ## archived based on another URL's response.
   rm --force -- "$curl_output_file"

   ## $curl and $curl_web_archive_opts are deliberately unquoted (word splitting).
   # shellcheck disable=SC2086
   curl_http_status_code=$($curl $curl_web_archive_opts "https://web.archive.org/web/$1") || true

   ## HTTP header lines end in CRLF; strip the CR so that parsed values
   ## (the snapshot link) do not carry a trailing carriage return.
   curl_output=""
   if [ -r "$curl_output_file" ]; then
      curl_output="$(tr --delete '\r' < "$curl_output_file")"
   fi

   if [ "$curl_http_status_code" = "403" ]; then
      echo "Excluded from web archive."
      echo "$1" | tee -a "$TMPFOLDER/webarchive/already-known-excluded.txt"
      return 0
   fi

   if [ "$curl_http_status_code" = "404" ]; then
      echo "Down according to web archive. curl_http_status_code: '$curl_http_status_code'"
      echo "$1" | tee -a "$TMPFOLDER/webarchive/already-known-down.txt"
      return 0
   fi

   ## Header field names are case-insensitive ('location:' vs 'Location:').
   ## '|| true' keeps a missing header from being fatal under 'set -e'.
   location=$(printf '%s\n' "$curl_output" | grep --ignore-case '^location:') || true
   link=$(printf '%s\n' "$location" | awk '{ print $2 }')
   if printf '%s\n' "$link" | grep --quiet --fixed-strings "https://web.archive.org/web/" ; then
      echo "Already archived according to the web archive. curl_http_status_code: '$curl_http_status_code'"
      echo "$1" | tee -a "$TMPFOLDER/webarchive/already-archived.txt"
      echo "$link" | tee -a "$TMPFOLDER/webarchive/already-archived.txt"
      return 0
   fi

   echo "Not yet archived. curl_http_status_code: $curl_http_status_code | curl_output:"
   echo "$curl_output"
   return 1
}

## Ask the web archive (web.archive.org) to archive a URL.
##
## Globals read:
##   curl                       command used to perform the request
##   curl_web_archive_opts      option string, intentionally word-split
##   TMPFOLDER                  base folder; state is kept in $TMPFOLDER/webarchive
##   wait_if_too_many_requests  only used in the HTTP 429 message
## Arguments:
##   $1 - the URL to archive
## Outputs:
##   Progress messages on stdout. Depending on the HTTP status code, appends
##   the URL to one of the cache files:
##     523   -> already-known-down.txt
##     520   -> already-known-unknown-error.txt
##     403   -> already-known-excluded.txt
##     other -> already-unknown-case-error.txt
## Returns:
##   1 on HTTP 429 (too many requests; the caller is expected to wait and retry),
##   0 in every other case.
archive_save() {
   local curl_http_status_code curl_output location link
   local curl_output_file="$TMPFOLDER/webarchive/curl-output-temp"

   echo "Requesting web archive link... https://web.archive.org/save/$1"

   ## Drop the output of any previous request so that a failed curl call
   ## cannot make us report another URL's stale response.
   rm --force -- "$curl_output_file"

   ## $curl and $curl_web_archive_opts are deliberately unquoted (word splitting).
   # shellcheck disable=SC2086
   curl_http_status_code=$($curl $curl_web_archive_opts "https://web.archive.org/save/$1") || true

   ## HTTP header lines end in CRLF; strip the CR so that the printed link
   ## does not carry a trailing carriage return.
   curl_output=""
   if [ -r "$curl_output_file" ]; then
      curl_output="$(tr --delete '\r' < "$curl_output_file")"
   fi

   case "$curl_http_status_code" in
      302)
         echo "Archived. Web archive reported that archiving succeeded."
         ## Header field names are case-insensitive; a missing header is not fatal.
         location=$(printf '%s\n' "$curl_output" | grep --ignore-case '^location:') || true
         link=$(printf '%s\n' "$location" | awk '{ print $2 }')
         echo "$link"
         return 0
         ;;
      523)
         echo "Origin link down."
         echo "$1" | tee -a "$TMPFOLDER/webarchive/already-known-down.txt"
         return 0
         ;;
      429)
         ## The caller performs the actual sleep before retrying.
         echo "Web archive server denied request, reported too many requests. Waiting for $wait_if_too_many_requests seconds."
         return 1
         ;;
      520)
         echo "Web archive unknown error."
         echo "$1" | tee -a "$TMPFOLDER/webarchive/already-known-unknown-error.txt"
         return 0
         ;;
      403)
         echo "Excluded from web archive."
         echo "$1" | tee -a "$TMPFOLDER/webarchive/already-known-excluded.txt"
         return 0
         ;;
   esac

   echo "Unknown case. curl_http_status_code: $curl_http_status_code | curl_output:"
   echo "$curl_output"
   echo "$1" | tee -a "$TMPFOLDER/webarchive/already-unknown-case-error.txt"
   return 0
}

#set -x
set -e

## TODO: no root check

## List of URLs to process, one per line.
## NOTE(review): the usage example at the top of this file passes a file path,
## but no positional argument is read here - confirm whether that is intended.
file_name="$TMPFOLDER/wiki-links/links-sorted.txt"

if ! test -r "$file_name" ; then
   echo "ERROR: file_name '$file_name' does not exist or is not readable." >&2
   exit 1
fi

## Count only non-empty lines, because empty lines are skipped below and
## would otherwise make the progress counter never reach the total.
total="$(awk 'length($0) > 0 { n++ } END { print n + 0 }' "$file_name")"

counter=0

## '|| [ -n "$word" ]' also processes a last line without trailing newline.
while IFS= read -r word || [ -n "$word" ]; do
   if [ "$word" = "" ]; then
      continue
   fi

   counter=$(( counter + 1 ))
   echo "$counter / $total | word: '$word'"

   echo "Checking if already archived in locally cached file..."

   ## Skip URLs whose status was already recorded in any 'already-*' cache
   ## file. Match whole lines only: a substring match would wrongly skip a
   ## URL that is merely a prefix of another cached URL or of a snapshot link.
   do_skip=""
   for cached_file in "$TMPFOLDER"/webarchive/already-* ; do
      ## Guards against the unexpanded glob pattern when nothing matches.
      [ -f "$cached_file" ] || continue
      if grep --quiet --fixed-strings --line-regexp -- "$word" "$cached_file" ; then
         echo "Already locally cached file status. cached_file: '$cached_file'"
         do_skip=true
      fi
   done
   if [ "$do_skip" = "true" ]; then
      continue
   fi

   ## Up to 60 attempts: stop as soon as either the check or the save request
   ## reports a settled status; otherwise wait and try again.
   for retry_counter in {1..60} ; do
      if archive_check "$word" ; then
         break
      fi

      if archive_save "$word" ; then
         break
      fi
      sleep "$wait_if_too_many_requests"
   done

   echo ""
done < "$file_name"
