india-isin-data/fetch.sh

59 lines
1.6 KiB
Bash
Raw Normal View History

2021-06-10 06:59:06 +00:00
#!/bin/bash
2022-04-24 08:34:50 +00:00
# Locate the `pup` HTML parser. When it is not on PATH, download the pinned
# v0.4.0 linux/amd64 release into the current directory and use that copy.
# PUP_BINARY is exported so that processes started by this script can read it.
if PUP_BINARY="$(command -v pup)"; then
  export PUP_BINARY
else
  # Abort on a failed download: continuing would leave PUP_BINARY pointing at
  # a file that does not exist and every later fetch would silently yield
  # nothing.
  wget https://github.com/ericchiang/pup/releases/download/v0.4.0/pup_v0.4.0_linux_amd64.zip -O pup.zip || {
    echo "error: could not download pup" >&2
    exit 1
  }
  # NOTE(review): the digest below is the SHA-256 of *empty input*, so this
  # check cannot succeed for a real archive. It is kept non-fatal (as before)
  # until the digest is replaced.
  # TODO: pin the real digest of pup_v0.4.0_linux_amd64.zip and turn the
  # warning into `exit 1`.
  printf '%s  %s\n' \
    "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" \
    "pup.zip" | sha256sum --check ||
    echo "warning: pup.zip failed checksum verification" >&2
  # -o: overwrite a stale ./pup instead of prompting for confirmation.
  unzip -o pup.zip || {
    echo "error: could not extract pup.zip" >&2
    exit 1
  }
  rm pup.zip
  chmod +x ./pup
  PUP_BINARY="$PWD/pup"
  export PUP_BINARY
fi
2021-06-10 06:59:06 +00:00
#######################################
# Fetch one page of search results from nsdl.co.in and append its rows,
# as comma-separated lines, to a file.
# Call with: INX page_num file_name
# Globals:   PUP_BINARY (read) - path of the pup executable
# Arguments: $1 - value sent as the `cnum` form field (e.g. INE)
#            $2 - page number sent as the `page_no` form field
#            $3 - file the rows are appended to
# Outputs:   progress line "[+] <cnum>/<page>" on stdout
#######################################
fetch_page() {
  local cnum=$1 page=$2 out_file=$3

  echo "[+] $cnum/$page"

  # PUP_BINARY is quoted because it may be "$(pwd)/pup", and the working
  # directory can contain spaces.
  curl "https://nsdl.co.in/master_search_res.php" \
    --user-agent "Mozilla/Gecko/Firefox/58.0" \
    --silent \
    --retry 3 \
    --connect-timeout 10 \
    --retry-max-time 30 \
    --data "cnum=$cnum" \
    --data "page_no=$page" |
    "$PUP_BINARY" '#nsdl-tables tr json{}' |
    # Emit 5 raw values per table row, one per line. The first value is read
    # from the element nested inside the second cell (that cell holds a link).
    jq --raw-output '.[] | [.children[1].children[0].text, .children[2].text, .children[3].text,.children[4].text,.children[5].text]|.[]' |
    # Join every 5 lines into one comma-separated record.
    paste -d, - - - - - |
    # The first record is not needed (presumably the table header).
    tail -n +2 >>"$out_file"
}
#######################################
# Print the number of result pages for a search, read from the `value`
# attribute of the `total_page` input on page 1 of the results.
# Globals:   PUP_BINARY (read) - path of the pup executable
# Arguments: $1 - value sent as the `cnum` form field (e.g. INE)
# Outputs:   whatever pup extracts, on stdout (empty if the request or the
#            lookup yields nothing)
#######################################
fetch_total_pages() {
  local cnum=$1

  # Same retry/timeout settings as fetch_page, so a stalled connection
  # cannot hang the script before any page is fetched.
  curl "https://nsdl.co.in/master_search_res.php" \
    --user-agent "Mozilla/Gecko/Firefox/58.0" \
    --silent \
    --retry 3 \
    --connect-timeout 10 \
    --retry-max-time 30 \
    --data "cnum=$cnum" \
    --data "page_no=1" |
    "$PUP_BINARY" 'input[name=total_page] attr{value}'
}
# Export the function so the command line handed to sem can call it.
export -f fetch_page

#######################################
# Queue one fetch_page job per result page through sem (-j 10).
# Every job appends to "<class>.csv".
# Arguments: $1 - class prefix, passed to fetch_page as its first argument
#            $2 - number of pages to fetch
# Returns:   1, without queueing anything, when $2 is not a non-negative
#            integer (e.g. empty because the page count lookup failed)
#######################################
fetch_class() {
  # Loop state is local so that a caller's variables (such as a global `i`)
  # are left untouched.
  local class=$1 total=${2-}
  local page

  if [[ ! $total =~ ^[0-9]+$ ]]; then
    printf 'fetch_class: invalid page count for %s: "%s"\n' "$class" "$total" >&2
    return 1
  fi
  # Force base 10 so a value with leading zeros is not read as octal.
  total=$((10#$total))

  for ((page = 1; page <= total; page++)); do
    sem -j 10 --timeout 500% fetch_page "$class" "$page" "$class.csv"
  done
}
# For each class (INE, INF, IN9): look up the page count, rebuild
# "<class>.csv" from scratch with parallel page fetches, then sort it.
for i in E F 9; do
  # Capture the names up front so they do not depend on `i` after helper
  # calls; functions that do not declare their variables `local` share the
  # global scope.
  class="IN$i"
  csv="$class.csv"

  total=$(fetch_total_pages "$class")
  echo "::group::$class (Total=$total)"
  # -f: the file does not exist on a first run.
  rm -f -- "$csv"
  fetch_class "$class" "$total"
  # Wait for every queued job before touching the file: sorting while jobs
  # are still appending would lose or misplace rows.
  sem --wait
  echo "::endgroup::"
  # Sort the file in place (skipped when no job produced any output).
  if [[ -f "$csv" ]]; then
    sort -o "$csv" "$csv"
  fi
done