pulse/dl.sh

32 lines
1.6 KiB
Bash
Raw Normal View History

2018-01-14 20:11:29 +00:00
#!/bin/bash
2019-01-14 10:50:06 +00:00
#######################################
# Fetch one GOI Directory site-counter page and record its redirect tag.
# Arguments: $1 - site id, substituted into the ?id= query parameter
# Outputs:   appends every response line containing
#            '<meta http-equiv="refresh"' to urls.raw.txt in the
#            current working directory
# Returns:   status of the pipeline; without pipefail that is grep's
#            status, i.e. non-zero when no matching line was found
#######################################
doit() {
  # Keep the id local so that calling doit never clobbers a caller's $ID.
  local id=$1
  curl -s "http://goidirectory.nic.in/sitecounter.php?id=${id}" \
    | grep '<meta http-equiv="refresh"' >> urls.raw.txt
}
# Export the function so that child bash processes can call it by name.
export -f doit
2018-01-14 20:11:29 +00:00
# Download from the goidirectory website.
# Site ids 1..goi_max_id are each handed to `doit`, queued through `sem`
# (GNU parallel's semaphore wrapper) with -j limiting how many run at once.
# GOI_MAX_ID and GOI_JOBS may be set in the environment to override the
# defaults, which are the values that were previously hard-coded here.
echo "Downloading from the GOI Directory"
goi_max_id=${GOI_MAX_ID:-14648}
goi_jobs=${GOI_JOBS:-30}
for ((i = 1; i <= goi_max_id; i++)); do
  sem -j"$goi_jobs" doit "$i"
done
# Block until every job queued above has finished before moving on.
sem --wait
2018-01-14 20:11:29 +00:00
# Get crts issued in gov.in
# TODO: This isn't as good as the censys.io export
# that pulse uses, switch to that
#
# The rows are fetched in crtsh_pages pages of crtsh_page_size rows each
# (6 x 1000, the same 6000-row window as before), one psql call per page,
# and all pages are written to crt.sh.domains.txt, replacing its contents.
# NOTE(review): the query has no ORDER BY, and PostgreSQL does not promise
# a stable row order across separate LIMIT/OFFSET queries, so pages may
# overlap or skip rows. Adding ORDER BY may be too slow on crt.sh - confirm
# before changing.
# NOTE(review): psql's default output format is kept as-is; presumably the
# later parsing step copes with its header/footer lines - confirm.
echo "Downloading from the crt.sh database"
crtsh_page_size=1000
crtsh_pages=6
for ((page = 0; page < crtsh_pages; page++)); do
  offset=$((page * crtsh_page_size))
  printf '%s\n' "select name_value FROM certificate_identity WHERE name_value LIKE '%.gov.in' LIMIT ${crtsh_page_size} OFFSET ${offset};" \
    | psql -h crt.sh -p 5432 -U guest certwatch
done > crt.sh.domains.txt
2018-01-14 20:11:29 +00:00
2019-01-14 10:50:06 +00:00
echo "Combining them together"
# Merge both raw lists into combined.txt; sort reads the two files itself
# (no intermediate cat) and -u drops duplicate lines.
sort -u urls.raw.txt crt.sh.domains.txt > combined.txt
# Run the PHP parser and de-duplicate whatever it prints into domains.csv.
# NOTE(review): parse.php is not visible here; presumably it reads
# combined.txt from the current directory - confirm.
php parse.php | sort -u > domains.csv