## parse_cgit -- state of the script after commit
## 5bd5eae1db07ada543c5ecacb6069b539958bd33 ("refactor parse_cgit",
## KatolaZ, Thu, 9 Jan 2020 08:09:15 +0000; blob 649a692 -> d6951af,
## as rendered by cgit v1.2.3), with review fixes applied.
##
## NOTE(review): the patch hunk starts at line 12 of parse_cgit.  The first
## 11 lines of the script (they end in a closing `fi`; presumably the
## shebang and an argument check -- confirm against the repository) are not
## part of the hunk and are therefore not reproduced here.
##
## Positional arguments:
##   $1  base URL of the index page to crawl
##   $2  destination directory for the retrieved files
##
## No `local` is used below because the interpreter line is not visible;
## the code is kept within POSIX sh.

FIN="/dev/stdin"
URLBASE="$1"
DEST="$2"
SUBPATH="/plain"
## Expanded unquoted on purpose so that it splits into command + options.
CURL="torify curl -Ls "
READMES="README README.txt README.md readme readme.txt readme.md"

## func
## Remove the scratch file and terminate the script.
cleanup () {
  rm -f "$TMPFILE"
  exit
}

## func
## Fill the global $repos with one "<field3>|<field4>" entry per matching
## row of the xml2tsv dump stored in $TMPFILE ("href=" stripped, first
## blank turned into "|").  The main loop treats the part before "|" as
## the repository link and the part after it as the repository name.
## Reads the cached dump instead of downloading the index page a second
## time.
get_repos() {
  repos=$(grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+title=" "$TMPFILE" \
    | awk -F'\t' '{print $3,$4}' \
    | sed -E 's/href=//g;s/ /\|/')
}

## func
## Print a short description record for one repository on stdout.
##   $1  repository name (echoed back verbatim)
##   $2  URL of the repository page
## Output: a "reponame:" line, a "url:" line, then fields 3.. of the last
## two xml2tsv rows matching "td class=sub" on the fetched page.
get_descr() {
  reponame=$1
  url=$2
  echo "reponame: $reponame"
  echo "url: $url"
  ## Get author and description
  $CURL "$url" | xml2tsv \
    | grep -Ei "/html/body/div/table/tr/td[[:blank:]]+class=sub" \
    | tail -2 | cut -f 3-
}

## func
## Try to download every file name listed in $READMES.
##   $1  URL prefix the file names are appended to
##   $2  directory the files are saved in
## A download is discarded when its content parses as a page carrying the
## "class=error Not found" marker; the match count and [OK]/[FAILED] are
## printed for each attempt.
get_readmes() {
  LINK=$1
  DESTDIR=$2
  ## $READMES is split on blanks on purpose: it is a list of file names.
  for f in $READMES; do
    printf " trying file %s..." "$LINK/$f"
    $CURL "$LINK/$f" > "$DESTDIR/$f"
    failure=$(xml2tsv < "$DESTDIR/$f" 2>/dev/null \
      | grep -Eaic "/html/body/div/div/div[[:blank:]]+class=error[[:blank:]]+Not found")
    echo "$failure"
    if [ "$failure" != 0 ]; then
      rm -f "$DESTDIR/$f"
      printf "[FAILED]\n"
    else
      printf "[OK]\n"
    fi
    ## Pause between requests.
    sleep 1
  done
}

PROTO=${URLBASE%%:\/\/*}
DIRBASE="$PROTO/${URLBASE##[a-z]*:\/\/}"
echo "proto: $PROTO"
echo "dirbase: $DIRBASE"

## Unpredictable scratch file instead of ./tmp_$$ in the current directory.
TMPFILE=$(mktemp "${TMPDIR:-/tmp}/parse_cgit.XXXXXX") || {
  echo "cannot create temporary file" >&2
  exit 1
}

## SIGKILL cannot be trapped, so only EXIT, TERM and INT are listed.
trap cleanup EXIT TERM INT

$CURL "${URLBASE}" | xml2tsv > "$TMPFILE"

get_repos

## $repos is split on blanks on purpose: one "<link>|<name>" word per repo.
for r in $repos; do
  link=${r%%|*}
  name=${r#*|}
  printf "Retrieving repo %s...\n" "$name"
  baselink="${URLBASE}${link}"
  REPODIR="$DEST/$DIRBASE/$link/"
  ## Skip this repository if its directory cannot be created.
  mkdir -p "$REPODIR" || continue
  get_descr "$link" "$baselink" > "${REPODIR}/DESCR"
  ## Get READMEs
  get_readmes "$baselink/$SUBPATH" "$REPODIR"
done

cleanup

### the readme file is at REPLINK/plain/README
### if not found, look for "/html/body/div/div/div class=error Not found"