summaryrefslogtreecommitdiff
path: root/parse_cgit
diff options
context:
space:
mode:
authorKatolaZ <katolaz@freaknet.org>2020-01-08 07:45:14 +0000
committerKatolaZ <katolaz@freaknet.org>2020-01-08 07:45:14 +0000
commitbc3a1f13fd5fddb9fa8f3ecff6144b512ed9d08b (patch)
treeccff9f0741a92a65f5aebd3d9c155aa6868bfe45 /parse_cgit
initial commit
Diffstat (limited to 'parse_cgit')
-rwxr-xr-xparse_cgit59
1 files changed, 59 insertions, 0 deletions
diff --git a/parse_cgit b/parse_cgit
new file mode 100755
index 0000000..34029fe
--- /dev/null
+++ b/parse_cgit
@@ -0,0 +1,59 @@
+#!/bin/sh
+
+#
+# Fetch the cgit-style repo index at <urlbase> and save each repo's README files under <destdir>
+#
+
+if [ "$#" -lt 2 ]; then  # both <urlbase> and <destdir> are required
+  printf "Usage: %s <urlbase> <destdir>\n" "$0" >&2
+  exit 1
+fi
+
+URLBASE="${1}"
+DEST="${2}"
+FIN="/dev/stdin"  # never read by the code below
+# Path segment placed between a repo link and the README file name.
+SUBPATH="/plain"
+# Downloader command line, expanded unquoted so it splits into words.
+CURL="torify curl -Ls "
+# File names probed in every repository.
+READMES="README README.txt README.md readme readme.txt readme.md"
+# Directory prefix used under DEST: scheme/host-and-path, both cut out of URLBASE.
+PROTO="${URLBASE%%://*}"
+DIRBASE="${PROTO}/${URLBASE##[a-z]*://}"
+echo "proto: $PROTO"
+echo "dirbase: $DIRBASE"
+
+# Scrape the index page: keep table anchors that carry a title, one link|name word each.
+repos=$(${CURL} "$URLBASE" \
+  | xml2tsv \
+  | grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+title=" \
+  | awk -F'\t' '{ row = $3 " " $4; gsub(/href=/, "", row); sub(/ /, "|", row); print row }'
+)
+
+# Try every candidate README of every repo; $repos holds link|name words.
+for r in $repos; do
+  link=$(printf '%s\n' "$r" | cut -d "|" -f 1)
+  name=$(printf '%s\n' "$r" | cut -d "|" -f 2)
+  printf "Retrieving repo %s...\n" "$name"
+  baselink="$URLBASE$link"
+  REPODIR="$DEST/$DIRBASE/$link/"
+  mkdir -p "$REPODIR" || continue  # cannot store anything for this repo
+  for f in $READMES; do
+    printf " trying file %s..." "$baselink/$SUBPATH/$f"
+    ${CURL} "$baselink/$SUBPATH/$f" > "$REPODIR/$f"
+    # A missing file comes back as an HTML error page; an empty file means no data arrived.
+    failure=$(xml2tsv < "$REPODIR/$f" 2>/dev/null | \
+      grep -Eaic "/html/body/div/div/div[[:blank:]]+class=error[[:blank:]]+Not found")
+    if [ "$failure" != 0 ] || [ ! -s "$REPODIR/$f" ]; then
+      rm -f "$REPODIR/$f"
+      printf "[FAILED]\n"
+    else
+      printf "[OK]\n"
+    fi
+  done
+done
+
+### the readme file is at REPLINK/plain/README
+
+### if not found, look for "/html/body/div/div/div class=error Not found"