# uk-gazette-archive/dl.sh
# NOTE(review): the lines above/below this header were pasted web-UI file
# metadata ("41 lines", "1.3 KiB", "Bash", "Executable File"); they have been
# commented out so the script parses. The shebang should be line 1 of the
# real executable file.
#!/bin/bash
# Download every gazette PDF by brute force: the archive names each file
# after its publication date (=date.pdf), so we simply try every date
# from a fixed start date through today.
current_date=$(date +'%d %b %Y')
start_date='01 JAN 2006'
#######################################
# Download the gazette PDF for one date.
# Arguments: $1 - a date in any format GNU `date -d` accepts
# Outputs:   saves pdfs/YYYY/MM/<ddmmyy>.pdf
# Returns:   non-zero if date parsing, mkdir, or wget fails
# Note: with -O, wget creates the output file even on HTTP errors, so
# dates with no gazette leave zero-byte files; the cleanup pass after
# the parallel run removes them.
#######################################
download_gazette() {
  local formatted_date dir url
  formatted_date=$(date -d "$1" +"%d%m%y") || return
  # %Y gives the full year directly (the original "20%y" hack breaks
  # after 2099 and is otherwise identical for these dates).
  dir=$(date -d "$1" +"pdfs/%Y/%m") || return
  url="https://gazettes.uk.gov.in/entry/gazette/gz${formatted_date}.pdf"
  mkdir -p "$dir" || return
  wget -q -nc -nv "$url" -O "$dir/$formatted_date.pdf"
}
# Export the function so the per-job shells spawned by GNU Parallel can
# call it.
export -f download_gazette

#######################################
# Build the list of dates to fetch: every day from $1 up to (but not
# including) $2.
# Arguments: $1 - start date, $2 - end date (GNU `date -d` formats)
# Globals:   dates_to_download (written)
# Note: compare epoch seconds rather than formatted strings so the loop
# cannot spin forever if the two string formats ever fail to match
# exactly (the original compared "01 JAN 2006"-style input against
# `date`-normalized "01 Jan 2006" output).
#######################################
build_date_list() {
  local day=$1 end_epoch
  end_epoch=$(date -d "$2" +%s) || return
  dates_to_download=()
  while (( $(date -d "$day" +%s) < end_epoch )); do
    dates_to_download+=("$day")
    day=$(date -d "$day + 1 day" +"%d %b %Y")
  done
}
build_date_list "$start_date" "$current_date"
# Fan the downloads out across 100 parallel jobs.
parallel -j 100 download_gazette ::: "${dates_to_download[@]}"
# wget -O leaves a zero-byte file for every date that has no gazette;
# delete those, then prune any directories left empty. Scope the cleanup
# to pdfs/ — the original `find .` would delete ANY empty file or
# directory under the working directory, including an empty input.txt
# needed by the metadata step below.
if [[ -d pdfs ]]; then
  find pdfs -type f -empty -delete
  find pdfs -type d -empty -delete
fi
# Now, we need to download the information files (HTML)
# that contain the metadata for each Gazette file
# URLs come from input.txt; --level 1 --relative --no-parent limits the
# crawl to pages directly linked from each listed URL, and
# --ignore-tags=img,link skips images and stylesheets.
wget -i input.txt --recursive --adjust-extension --level 1 --ignore-tags=img,link --relative --no-parent
# Now, we parse the metadata from the HTML files
# and save it as a CSV file
# in some cases, this will make a few further requests
# NOTE(review): bare `python` is absent on many modern distros — confirm
# generate.py runs under python3 and consider invoking `python3` here.
python generate.py