india-isin-data/src/fetch.sh

66 lines
1.8 KiB
Bash
Raw Permalink Normal View History

2021-06-10 06:59:06 +00:00
#!/bin/bash
2022-04-24 08:34:50 +00:00
# Locate the `pup` HTML parser and publish its path as PUP_BINARY.
# If pup is not on PATH, download the pinned v0.4.0 linux/amd64 release into
# the current directory, verify it against a known SHA-256, and use that copy.
# Exits non-zero if the download or the checksum verification fails.
PUP_BINARY="$(command -v pup)"
export PUP_BINARY
if [[ -z "$PUP_BINARY" ]]; then
  wget --quiet https://github.com/ericchiang/pup/releases/download/v0.4.0/pup_v0.4.0_linux_amd64.zip -O pup.zip || exit 1
  # Refuse to unpack or run a download that does not match the pinned hash.
  # (Two spaces between hash and file name: the canonical sha256sum format.)
  if ! echo "ec3d29e9fb375b87ac492c8b546ad6be84b0c0b49dab7ff4c6b582eac71ba01c  pup.zip" | sha256sum --strict --check; then
    echo "pup.zip failed checksum verification" >&2
    rm -f pup.zip
    exit 1
  fi
  unzip -o pup.zip
  rm pup.zip
  chmod +x ./pup
  PUP_BINARY="$(pwd)/pup"
  export PUP_BINARY
fi
2021-06-10 06:59:06 +00:00
# Fetch one page of NSDL search results and append its rows to a CSV file.
# Globals:   PUP_BINARY (read) - path to the pup executable
# Arguments: $1 - ISIN class/prefix sent as `cnum` (e.g. INE)
#            $2 - page number sent as `page_no`
#            $3 - CSV file to append the rows to
# Outputs:   a "[+] class/page" progress line on stdout
function fetch_page() {
  local class="$1" page="$2" outfile="$3"
  echo "[+] $class/$page"
  curl "https://nsdl.co.in/master_search_res.php" \
    --no-progress-meter \
    --user-agent "Mozilla/Gecko/Firefox/58.0" \
    --retry 10 \
    --connect-timeout 30 \
    --retry-max-time 100 \
    --data "cnum=$class" \
    --data "page_no=$page" |
    "$PUP_BINARY" '#nsdl-tables tr json{}' |
    # Generate a CSV (this contains the header row as well)
    jq --raw-output '.[] | [.children[1].children[0].text, .children[2].text, .children[3].text,.children[4].text,.children[5].text]|@csv' |
    # Convert the HTML entity &amp; back to a literal &
    sed 's/&amp;/\&/g' |
    # Drop the first row
    tail -n +2 >> "$outfile"
}
# Print the number of result pages NSDL reports for an ISIN class.
# Requests page 1 and extracts the value of the `total_page` input element.
# Uses the same retry/timeout settings as fetch_page so that a transient
# network failure is retried instead of yielding an empty page count.
# Globals:   PUP_BINARY (read) - path to the pup executable
# Arguments: $1 - ISIN class/prefix sent as `cnum`
# Outputs:   the page count on stdout (empty if the element is absent)
function fetch_total_pages() {
  local class="$1"
  curl "https://nsdl.co.in/master_search_res.php" \
    --user-agent "Mozilla/Gecko/Firefox/58.0" \
    --silent \
    --retry 10 \
    --connect-timeout 30 \
    --retry-max-time 100 \
    --data "cnum=$class" \
    --data "page_no=1" |
    "$PUP_BINARY" 'input[name=total_page] attr{value}'
}
# Export the function to child processes; `sem` (used in fetch_class) is
# handed the name fetch_page as the command to run, presumably in a new shell.
export -f fetch_page
# Queue a fetch_page job for every result page of an ISIN class.
# Jobs are submitted through `sem` (at most 10 at a time) and all append to
# "<class>.csv"; the caller must run `sem --wait` before reading that file.
# Arguments: $1 - ISIN class/prefix
#            $2 - number of pages; when empty or missing, only page 1 is
#                 fetched (same as the previous unquoted `seq 1 $2` behaviour)
function fetch_class() {
  local class="$1"
  local last_page="${2:-1}"
  local page
  for page in $(seq 1 "$last_page"); do
    sem -j 10 --timeout 500% fetch_page "$class" "$page" "$class.csv"
  done
}
2022-05-02 04:21:41 +00:00
# Entry point: rebuild "<CLASS>.csv" for the ISIN class given as $1.
# Abort early when no class is given, instead of operating on ".csv".
CLASS="${1:?usage: $0 <ISIN class, e.g. INE>}"
total=$(fetch_total_pages "$CLASS")
# ::group:: / ::endgroup:: are CI log-folding markers around the fetch output.
echo "::group::$CLASS (Total=$total)"
rm -f -- "$CLASS.csv"
fetch_class "$CLASS" "$total"
# Wait for every queued fetch_page job before closing the log group, so their
# progress lines stay inside it and the CSV is complete before post-processing.
sem --wait
echo "::endgroup::"
# Sort the file in place
sort -o "$CLASS.csv" "$CLASS.csv"
# Remove lines that don't start with the correct prefix
# This is to avoid ISINs like INF955L01IN9 showing up under IN9
# Note that there is a " at the beginning to account for quoted CSVs
sed -i "/^\"$CLASS/!d" "$CLASS.csv"