#!/bin/sh

#
# Fetch a cgit-style repo index from <urlbase> and download the README
# files of every listed repository below <destdir>
#

# Both <urlbase> and <destdir> are required. Without them, explain the
# invocation on stderr (so it never mixes with regular output) and stop.
if [ "$#" -lt 2 ]; then
	printf 'Usage: %s <urlbase> <destdir>\n' "$0" >&2
	exit 1
fi

# NOTE(review): FIN is not referenced anywhere else in the visible script.
FIN="/dev/stdin"
URLBASE="$1"	# URL of the repo index page
DEST="$2"	# root directory that receives the downloads

# Path component appended to a repo link to reach its raw files.
SUBPATH="/plain"

# Fetch command plus options; it is expanded unquoted on purpose so that it
# splits into separate words.
CURL="torify curl -Ls "

# Split URLBASE at its first "://": PROTO is the scheme, the remainder is the
# host and path, and DIRBASE becomes "<scheme>/<host and path>".
# Both expansions cut at the FIRST "://", so they stay consistent with each
# other even if a later part of the URL contains "://" itself.
# NOTE(review): a URLBASE without "://" yields DIRBASE="$URLBASE/$URLBASE".
PROTO=${URLBASE%%://*}
DIRBASE="$PROTO/${URLBASE#*://}"
echo "proto: $PROTO"
echo "dirbase: $DIRBASE"

# Candidate README file names probed for every repository.
READMES="README README.txt README.md readme readme.txt readme.md"


# Download the index page and boil it down to one "link|name" record per
# repository. xml2tsv flattens the HTML into tab-separated rows; only the
# rows for the <a title=...> cells of the repo table are kept. From those,
# field 3 (with its "href=" prefix removed) and field 4 are joined by "|".
# NOTE(review): presumably field 3 is the href attribute and field 4 the
# link text -- confirm against real xml2tsv output.
repos=$(
	${CURL} "$URLBASE" \
		| xml2tsv \
		| grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+title=" \
		| awk -F'\t' '{
			rec = $3 " " $4
			gsub(/href=/, "", rec)
			sub(/ /, "|", rec)
			print rec
		}'
)


# Walk the "link|name" records held in $repos (one per line) and, for each
# repository, try to download every candidate README into
# $DEST/$DIRBASE/<link>/. Files that cannot be fetched, or that turn out to
# be the server's "Not found" error page, are removed again.
while IFS= read -r r; do
	# An empty $repos still produces one blank line in the here-document.
	[ -n "$r" ] || continue

	link=${r%%|*}	# text before the first "|"
	name=${r#*|}	# text after the first "|"
	printf "Retrieving repo %s...\n" "$name"

	baselink="$URLBASE$link"
	REPODIR="$DEST/$DIRBASE/$link/"
	if ! mkdir -p -- "$REPODIR"; then
		printf "cannot create directory %s\n" "$REPODIR" >&2
		continue
	fi

	# READMES is a space-separated list: word splitting is intended here.
	for f in $READMES; do
		url="$baselink/$SUBPATH/$f"
		out="$REPODIR/$f"
		printf "    trying file %s..." "$url"

		# CURL holds a command plus its options, so it stays unquoted.
		# stdin is redirected so the fetch can never consume the record
		# list that feeds the outer loop.
		failure=0
		${CURL} "$url" > "$out" < /dev/null || failure=1

		# A missing file comes back as an HTML error page rather than
		# as a transfer error, so the download itself is inspected.
		if [ "$failure" -eq 0 ] &&
			xml2tsv < "$out" 2>/dev/null |
			grep -Eaiq "/html/body/div/div/div[[:blank:]]+class=error[[:blank:]]+Not found"; then
			failure=1
		fi

		if [ "$failure" -ne 0 ]; then
			rm -f -- "$out"
			printf "[FAILED]\n"
		else
			printf "[OK]\n"
		fi
		#sleep 1
	done
done <<EOF
$repos
EOF

### The README of a repository is served at <repo link>/plain/README

### If it does not exist, the response is an error page that xml2tsv renders as
### "/html/body/div/div/div  class=error     Not found"