#!/bin/bash
# Scrape the NSDL master search into per-class CSV files.
# Usage: script.sh CLASS_PREFIX   (e.g. IN9)
# Requires: curl, jq, wget, unzip, GNU parallel (sem); pup is fetched if absent.

# Prefer a system-wide pup binary when one is installed.
PUP_BINARY="$(command -v pup || true)"
export PUP_BINARY

if ! command -v pup &> /dev/null
then
    # Download a pinned pup release and refuse to use it unless the
    # archive matches the expected SHA-256 digest (supply-chain guard).
    wget --quiet https://github.com/ericchiang/pup/releases/download/v0.4.0/pup_v0.4.0_linux_amd64.zip -O pup.zip
    if ! echo "ec3d29e9fb375b87ac492c8b546ad6be84b0c0b49dab7ff4c6b582eac71ba01c pup.zip" | sha256sum --strict --check; then
        echo "pup.zip checksum mismatch — aborting" >&2
        rm -f pup.zip
        exit 1
    fi
    unzip pup.zip
    rm pup.zip
    chmod +x ./pup
    export PUP_BINARY="$(pwd)/pup"
fi
# Fetch one result page for a class and append its data rows to a CSV.
# Call with: fetch_page CLASS_PREFIX PAGE_NUM FILE_NAME
# Globals: PUP_BINARY (read). Output: appends CSV rows to $3.
function fetch_page() {
    local cnum="$1" page="$2" out="$3"
    echo "[+] $cnum/$page"
    curl "https://nsdl.co.in/master_search_res.php" \
        --no-progress-meter \
        --write-out '%{stderr}DL :%{size_download}\nHTTP:%{response_code} E(%{errormsg})\n' \
        --user-agent "Mozilla/Gecko/Firefox/58.0" \
        --retry 3 \
        --connect-timeout 10 \
        --retry-max-time 30 \
        --data "cnum=$cnum" \
        --data "page_no=$page" |
    "$PUP_BINARY" '#nsdl-tables tr json{}' |
    # generate 6 lines (second column has a link, so parse that) with raw output
    jq --raw-output '.[] | [.children[1].children[0].text, .children[2].text, .children[3].text,.children[4].text,.children[5].text]|.[]' |
    # and create a CSV from every 5 lines
    paste -d, - - - - - |
    # and we don't need the first row
    tail -n +2 >> "$out"
}
# Print the total page count for a class prefix, scraped from the hidden
# `total_page` form field on page 1 of the search results.
# Call with: fetch_total_pages CLASS_PREFIX
# Globals: PUP_BINARY (read). Output: page count on stdout.
function fetch_total_pages() {
    curl "https://nsdl.co.in/master_search_res.php" \
        --user-agent "Mozilla/Gecko/Firefox/58.0" \
        --silent \
        --data "cnum=$1" \
        --data "page_no=1" |
    "$PUP_BINARY" 'input[name=total_page] attr{value}'
}
# Exported so GNU parallel's `sem` (which runs jobs in fresh shells) can see it.
export -f fetch_page
# Queue a parallel fetch of every page for a class; rows accumulate in CLASS.csv.
# Call with: fetch_class CLASS_PREFIX TOTAL_PAGES
# Note: jobs are only queued here — callers must run `sem --wait` afterwards.
function fetch_class() {
    local class="$1" pages="$2" i
    for (( i = 1; i <= pages; i++ )); do
        echo fetch_page "$class" "$i" "$class.csv"
        # At most 10 concurrent fetches; --timeout 500% kills a job that runs
        # past 5x the average job runtime (GNU parallel semantics).
        sem -j 10 --timeout 500% fetch_page "$class" "$i" "$class.csv"
    done
}
# ---- main ----
# $1 is the ISIN class prefix to scrape (e.g. IN9); it doubles as the
# release version written into CITATION.cff / datapackage.json below.
CLASS="$1"

total=$(fetch_total_pages "$CLASS")

echo "::group::$CLASS (Total=$total)"
# Start from a clean slate; -f so a missing file (first run) is not an error.
rm -f -- "$CLASS.csv"
fetch_class "$CLASS" "$total"
echo "::endgroup::"

# Barrier: wait for every queued sem job before post-processing the CSV.
sem --wait

# Sort the file in place
sort -o "$CLASS.csv" "$CLASS.csv"

# Remove lines that don't start with the correct prefix
# This is to avoid ISINs like INF955L01IN9 showing up under IN9
sed -i "/^$CLASS/!d" "$CLASS.csv"

# Update CITATION (only when the scraped data actually changed)
if [[ $(git diff --stat *.csv) != '' ]]; then
    sed -i "s/^version.*/version: $1/" CITATION.cff
    sed -i "s/^date-released.*/date-released: $(date --rfc-3339=date)/" CITATION.cff

    # jq cannot edit in place; write to a temp file and move it over.
    jq ".version = \"$1\" | .created = \"$(date --rfc-3339=seconds)\"" datapackage.json > d2.json
    mv d2.json datapackage.json

    git add CITATION.cff datapackage.json
fi