blob: d6951afc6201092c3fd3bc1ba49253770b454c9a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
#!/bin/sh
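#
# Parse a cgit-style repo index. The index page is downloaded from <urlbase>;
# for every repo listed there a DESCR file and any README files that exist
# are stored below <destdir>.
# (FIN names stdin, but the code below fetches the index with curl.)
#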
# Both <urlbase> and <destdir> are required. Without them, print a usage
# hint on stderr (so it never mixes with regular output) and stop with a
# non-zero status. "$0" is quoted so a script path containing blanks is
# printed as one word.
if [ "$#" -lt 2 ]; then
  printf 'Usage: %s <urlbase> <destdir>\n' "$0" >&2
  exit 1
fi
# --- command-line arguments -------------------------------------------
# Base URL of the cgit index page.
URLBASE=$1
# Directory below which everything is stored.
DEST=$2

# --- fixed settings ----------------------------------------------------
# Path of standard input (not referenced by the rest of the visible script).
FIN=/dev/stdin
# URL path component appended to a repo link to reach its raw files.
SUBPATH=/plain
# Download command; expanded unquoted on purpose so it splits into words.
CURL='torify curl -Ls '
# Per-process scratch file in the current directory ($$ is the shell's PID).
TMPFILE=./tmp_$$
# Candidate README file names, tried one after another.
READMES='README README.txt README.md readme readme.txt readme.md'
## func
# cleanup: delete the scratch file and terminate the script.
#   Globals: TMPFILE (read) - path of the file to remove.
#   Never returns: it always ends the shell via `exit`.
cleanup() {
  # Quoted and guarded with `--` so a path containing blanks or starting
  # with a dash is still removed as one file.
  rm -f -- "$TMPFILE"
  exit
}
## func
# get_repos: build the list of repos found on the index page.
#   Globals: CURL, URLBASE (read); repos (written).
#   Downloads $URLBASE, converts it with xml2tsv and keeps the rows for
#   anchors carrying a title attribute. From each row the 3rd and 4th
#   tab-separated fields are kept, every "href=" is stripped and the first
#   blank becomes "|", so each line of $repos reads "<link>|<text>".
get_repos() {
  repos=$(
    ${CURL} "$URLBASE" |
      xml2tsv |
      grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+title=" |
      awk -F'\t' '{print $3,$4}' |
      sed -E 's/href=//g;s/ /\|/'
  )
}
## func
# get_descr: print a short description record for one repo on stdout.
#   $1 - repo name, echoed back on the "reponame:" line
#   $2 - URL of the repo page to download
#   Globals: CURL (read); reponame, url (written).
#   Output: the two header lines, then field 3 onwards of the last two
#   xml2tsv rows whose cell has class=sub (author and description).
get_descr() {
  reponame=$1 url=$2
  echo "reponame: $reponame"
  echo "url: $url"
  ## Get author and description
  ## (disabled alternative, kept from the original source:)
  ## cat "$TMPFILE" | grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+href=$link" |\
  $CURL "$url" |
    xml2tsv |
    grep -Ei "/html/body/div/table/tr/td[[:blank:]]+class=sub" |
    tail -2 |
    cut -f 3-
}
## func
# get_readmes: try to download every candidate README of one repo.
#   $1 - base URL of the repo's raw-file tree
#   $2 - existing directory that receives the files
#   Globals: CURL, READMES (read); LINK, DESTDIR, f, failure (written).
#   For each name in $READMES the file is fetched into "$2/<name>". It is
#   removed again when the download command fails or when the response is
#   the "Not found" error page. Progress is printed on stdout as
#   " trying file <url>...<count>" followed by "[OK]" or "[FAILED]".
get_readmes() {
  LINK=$1
  DESTDIR=$2
  for f in $READMES; do
    printf " trying file %s..." "$LINK/$f"
    if $CURL "$LINK/$f" > "$DESTDIR/$f"; then
      # Count the error rows in the response; 0 means a real file arrived.
      failure=$(xml2tsv < "$DESTDIR/$f" 2>/dev/null |
        grep -Eaic "/html/body/div/div/div[[:blank:]]+class=error[[:blank:]]+Not found")
    else
      # The transfer itself failed: never keep a partial or empty file.
      failure=1
    fi
    # The raw count is still printed so the progress output keeps its shape.
    echo "$failure"
    if [ "$failure" != 0 ]; then
      rm -f -- "$DESTDIR/$f"
      printf "[FAILED]\n"
    else
      printf "[OK]\n"
    fi
    # Pause between requests.
    sleep 1
  done
}
# Split <urlbase> at its first "://": PROTO is the part before it, and
# DIRBASE is "<proto>/<rest>", used as the directory layout below $DEST.
PROTO=${URLBASE%%://*}
DIRBASE="$PROTO/${URLBASE#*://}"
echo "proto: $PROTO"
echo "dirbase: $DIRBASE"

# Remove the scratch file on normal exit and on INT/TERM. SIGKILL cannot
# be trapped, so it is not listed.
trap cleanup EXIT TERM INT

# Keep a converted copy of the index page in the scratch file.
# NOTE(review): nothing in the visible code reads it back - confirm it is
# still needed.
$CURL "${URLBASE}" | xml2tsv > "$TMPFILE"

get_repos

# $repos holds one "<link>|<name>" entry per line. Read it line by line so
# names containing blanks or glob characters stay intact. The list arrives
# on fd 3 so commands in the loop body keep the script's own stdin.
while IFS='|' read -r link name <&3; do
  # An empty list still yields one empty line from the here-document.
  [ -n "$link" ] || continue
  printf 'Retrieving repo %s...\n' "$name"
  baselink="${URLBASE}${link}"
  REPODIR="$DEST/$DIRBASE/$link/"
  if ! mkdir -p -- "$REPODIR"; then
    printf 'cannot create %s, skipping\n' "$REPODIR" >&2
    continue
  fi
  get_descr "$link" "$baselink" > "${REPODIR}/DESCR"
  ## Get READMEs
  get_readmes "$baselink/$SUBPATH" "$REPODIR"
done 3<<EOF
$repos
EOF
cleanup
### The README file of a repo is served at <repo link>/plain/README.
### If it does not exist, the xml2tsv output of the returned page contains
### "/html/body/div/div/div class=error Not found".
|