#!/bin/bash
# Scrape the NSDL master search into per-class CSV files.
# Usage: script.sh CLASS_PREFIX   (e.g. IN9)
# Requires: curl, jq, wget, unzip, GNU parallel (sem); pup is fetched if absent.

# Prefer a system-wide pup binary when one is installed.
PUP_BINARY="$(command -v pup || true)"
export PUP_BINARY

if ! command -v pup &> /dev/null
then
    # Download a pinned pup release and refuse to use it unless the
    # archive matches the expected SHA-256 digest (supply-chain guard).
    wget --quiet https://github.com/ericchiang/pup/releases/download/v0.4.0/pup_v0.4.0_linux_amd64.zip -O pup.zip
    if ! echo "ec3d29e9fb375b87ac492c8b546ad6be84b0c0b49dab7ff4c6b582eac71ba01c pup.zip" | sha256sum --strict --check; then
        echo "pup.zip checksum mismatch — aborting" >&2
        rm -f pup.zip
        exit 1
    fi
    unzip pup.zip
    rm pup.zip
    chmod +x ./pup
    export PUP_BINARY="$(pwd)/pup"
fi
# Fetch one result page for a class and append its data rows to a CSV.
# Call with: fetch_page CLASS_PREFIX PAGE_NUM FILE_NAME
# Globals: PUP_BINARY (read). Output: appends CSV rows to $3.
function fetch_page() {
    local cnum="$1" page="$2" out="$3"
    echo "[+] $cnum/$page"
    curl "https://nsdl.co.in/master_search_res.php" \
        --no-progress-meter \
        --write-out '%{stderr}DL :%{size_download}\nHTTP:%{response_code} E(%{errormsg})\n' \
        --user-agent "Mozilla/Gecko/Firefox/58.0" \
        --retry 3 \
        --connect-timeout 10 \
        --retry-max-time 30 \
        --data "cnum=$cnum" \
        --data "page_no=$page" |
    "$PUP_BINARY" '#nsdl-tables tr json{}' |
    # generate 6 lines (second column has a link, so parse that) with raw output
    jq --raw-output '.[] | [.children[1].children[0].text, .children[2].text, .children[3].text,.children[4].text,.children[5].text]|.[]' |
    # and create a CSV from every 5 lines
    paste -d, - - - - - |
    # and we don't need the first row
    tail -n +2 >> "$out"
}
# Print the total page count for a class prefix, scraped from the hidden
# `total_page` form field on page 1 of the search results.
# Call with: fetch_total_pages CLASS_PREFIX
# Globals: PUP_BINARY (read). Output: page count on stdout.
function fetch_total_pages() {
    curl "https://nsdl.co.in/master_search_res.php" \
        --user-agent "Mozilla/Gecko/Firefox/58.0" \
        --silent \
        --data "cnum=$1" \
        --data "page_no=1" |
    "$PUP_BINARY" 'input[name=total_page] attr{value}'
}
# Exported so GNU parallel's `sem` (which runs jobs in fresh shells) can see it.
export -f fetch_page
# Queue a parallel fetch of every page for a class; rows accumulate in CLASS.csv.
# Call with: fetch_class CLASS_PREFIX TOTAL_PAGES
# Note: jobs are only queued here — callers must run `sem --wait` afterwards.
function fetch_class() {
    local class="$1" pages="$2" i
    for (( i = 1; i <= pages; i++ )); do
        echo fetch_page "$class" "$i" "$class.csv"
        # At most 10 concurrent fetches; --timeout 500% kills a job that runs
        # past 5x the average job runtime (GNU parallel semantics).
        sem -j 10 --timeout 500% fetch_page "$class" "$i" "$class.csv"
    done
}
# ---- main ----
# $1 is the ISIN class prefix to scrape (e.g. IN9); it doubles as the
# release version written into CITATION.cff / datapackage.json below.
CLASS="$1"

total=$(fetch_total_pages "$CLASS")

echo "::group::$CLASS (Total=$total)"
# Start from a clean slate; -f so a missing file (first run) is not an error.
rm -f -- "$CLASS.csv"
fetch_class "$CLASS" "$total"
echo "::endgroup::"

# Barrier: wait for every queued sem job before post-processing the CSV.
sem --wait

# Sort the file in place
sort -o "$CLASS.csv" "$CLASS.csv"

# Remove lines that don't start with the correct prefix
# This is to avoid ISINs like INF955L01IN9 showing up under IN9
sed -i "/^$CLASS/!d" "$CLASS.csv"

# Update CITATION (only when the scraped data actually changed)
if [[ $(git diff --stat *.csv) != '' ]]; then
    sed -i "s/^version.*/version: $1/" CITATION.cff
    sed -i "s/^date-released.*/date-released: $(date --rfc-3339=date)/" CITATION.cff

    # jq cannot edit in place; write to a temp file and move it over.
    jq ".version = \"$1\" | .created = \"$(date --rfc-3339=seconds)\"" datapackage.json > d2.json
    mv d2.json datapackage.json

    git add CITATION.cff datapackage.json
fi