#!/bin/sh
#
# Mirror the README files of every repository listed on a cgit-style
# repo index page.
#
# Usage: <script> <urlbase> <destdir>
#
#   urlbase  URL of the index page; it is also prepended to every repo
#            link scraped from that page.
#   destdir  Directory under which downloaded READMEs are stored, as
#            <destdir>/<proto>/<host-and-path>/<repo-link>/<readme-name>.
#
# External tools used: torify, curl, xml2tsv, grep, awk, sed.
#
# A README is requested at <repo-link>/plain/<readme-name>. When the
# file does not exist, the server's answer converted by xml2tsv contains
#   /html/body/div/div/div class=error Not found
# and the download is discarded.

set -u

if [ $# -lt 2 ]; then
  printf 'Usage: %s <urlbase> <destdir>\n' "$0" >&2
  exit 1
fi

URLBASE="$1"
DEST="$2"
SUBPATH="/plain"

# Scheme part of the URL (text before "://") and the remainder after it.
PROTO=${URLBASE%%://*}
DIRBASE="$PROTO/${URLBASE##[a-z]*://}"

# Candidate README file names; deliberately a space-separated word list.
READMES="README README.txt README.md readme readme.txt readme.md"

# xml2tsv line that marks an error page returned instead of the file.
ERROR_RE='/html/body/div/div/div[[:blank:]]+class=error[[:blank:]]+Not found'

# Fetch the given URL(s) through torify, silently and following redirects.
fetch() {
  torify curl -Ls "$@"
}

printf 'proto: %s\n' "$PROTO"
printf 'dirbase: %s\n' "$DIRBASE"

# Build one "link|name" line per repository anchor found in the index
# table: field 3 of the xml2tsv row is the href attribute, field 4 the
# anchor text.
repos=$(
  fetch "$URLBASE" \
    | xml2tsv \
    | grep -Ei '/html/body/div/div/table/tr/td/a[[:blank:]]+title=' \
    | awk -F'\t' '{print $3,$4}' \
    | sed -E 's/href=//g;s/ /|/'
)

if [ -z "$repos" ]; then
  printf 'No repositories found at %s\n' "$URLBASE" >&2
  exit 1
fi

# Read the list line by line (instead of word-splitting it) so that repo
# names containing blanks or glob characters stay intact.
while IFS='|' read -r link name; do
  [ -n "$link" ] || continue

  # The link comes from remote HTML and becomes part of a local path:
  # refuse anything that could climb out of the destination directory.
  case "$link" in
    .. | ../* | */.. | */../*)
      printf 'Skipping suspicious repo link %s\n' "$link" >&2
      continue
      ;;
  esac

  printf 'Retrieving repo %s...\n' "$name"

  baselink="${URLBASE}${link}"
  REPODIR="$DEST/$DIRBASE/$link/"
  if ! mkdir -p -- "$REPODIR"; then
    printf 'Cannot create directory %s, skipping\n' "$REPODIR" >&2
    continue
  fi

  for f in $READMES; do
    # Drop a trailing slash from the repo link so the URL does not end
    # up with doubled slashes before "plain".
    url="${baselink%/}${SUBPATH}/${f}"
    target="${REPODIR}${f}"
    printf ' trying file %s...' "$url"

    # Success requires: the transfer worked, something was written, and
    # the payload is not the server's "Not found" error page.
    # stdin is redirected so the download cannot consume the repo list
    # feeding the enclosing loop.
    if fetch "$url" > "$target" < /dev/null \
      && [ -s "$target" ] \
      && ! xml2tsv < "$target" 2>/dev/null | grep -Eaiq "$ERROR_RE"; then
      printf '[OK]\n'
    else
      rm -f -- "$target"
      printf '[FAILED]\n'
    fi
    #sleep 1
  done
done <<EOF
$repos
EOF