Recursive fetch for metadata pages that require JS

This expands our coverage of gazette metadata to 100%
- Closes #3
Nemo 2023-12-06 16:28:32 +05:30
parent d76b3e64bf
commit 8e67974e97
3 changed files with 2148 additions and 68 deletions
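
The showgrid.aspx pager links are JavaScript postbacks, so instead of running a browser the scraper replays the page's own hidden form fields with __EVENTTARGET pointed at the pager link and POSTs them back to the live page. A minimal sketch of that idea, separate from the committed generate.py below (the Datagrid1 id and the __EVENTTARGET/Button1 field names are taken from the diff; everything else is illustrative):

import urllib3
from bs4 import BeautifulSoup

http = urllib3.PoolManager()

def next_page(html, url):
    # Parse the current page and locate the pager row at the bottom of the grid.
    soup = BeautifulSoup(html, "html5lib")
    table = soup.find("table", {"id": "Datagrid1"})
    pager = table.find_all("tr")[-1]
    link = pager.find("span").find_next_sibling("a")
    if link is None:
        return None  # last page: nothing left to fetch
    # Replay the page's own <input> fields; __EVENTTARGET names the pager target
    # (pulled out of the link's javascript: href), and the submit button field is
    # dropped, as generate.py does.
    form = {i.get("name"): i.get("value") for i in soup.find_all("input")}
    form.pop("Button1", None)
    form["__EVENTTARGET"] = link["href"].split("'")[1]
    resp = http.request_encode_body("POST", url, fields=form, encode_multipart=False)
    return resp.data  # HTML of the next page, fetched without a JS engine
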

dl.sh

@@ -1,5 +1,8 @@
 #!/bin/bash
+# This script downloads all valid gazette files
+# by brute-forcing all valid filenames (=date.pdf)
 start_date="01 JAN 2006"
 current_date=$(date +"%d %b %Y")
@@ -9,7 +12,7 @@ download_gazette() {
     dir="pdfs/20$(date -d "$1" +"%y/%m")"
     url="https://gazettes.uk.gov.in/entry/gazette/gz$formatted_date.pdf"
     mkdir -p "$dir"
-    wget -nc -nv "$url" -O "$dir/$formatted_date.pdf"
+    wget -q -nc -nv "$url" -O "$dir/$formatted_date.pdf"
 }
 # Export the function so GNU Parallel can access it
@@ -26,4 +29,13 @@ done
 parallel -j 100 download_gazette ::: "${dates_to_download[@]}"
 find . -type f -empty -delete
 find . -type d -empty -delete
+
+# Now, we need to download the information files (HTML)
+# that contain the metadata for each Gazette file
+wget -i input.txt --recursive --adjust-extension --level 1 --ignore-tags=img,link --relative --no-parent
+
+# Now, we parse the metadata from the HTML files
+# and save it as a CSV file
+# in some cases, this will make a few further requests
+python generate.py

File diff suppressed because it is too large

generate.py

@@ -1,9 +1,16 @@
 import glob
 import csv
+from os import path
+import sys
+import urllib3
+from collections import namedtuple
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse, parse_qs,urlunparse
 import html5lib
 from datetime import datetime

 # New function for pre-processing row_data
 def preprocess_row_data(row_data):
     # Step 1: Drop the 6th column (index=5)
@@ -19,50 +26,116 @@ def preprocess_row_data(row_data):
     return row_data

+
+# namedtuple to match the internal signature of urlunparse
+Components = namedtuple(
+    typename='Components',
+    field_names=['scheme', 'netloc', 'url', 'path', 'query', 'fragment']
+)
+
 html_files = glob.glob("gazettes.uk.gov.in/showgrid*.html")

 # Step 7: Create a list to store all rows across all files
 output_data = []

 # Define the CSV file header
-header = ["GO No.", "GO Date", "GO Description", "Issued by", "Gazette Week Date", "Pg No"]
+header = [
+    "GO No.",
+    "GO Date",
+    "GO Description",
+    "Issued by",
+    "Gazette Week Date",
+    "Pg No",
+]
+
+# incomplete_links = []
+# def start_js_scrape(url):
+
+
+def parse_html(html):
+    # Step 3: Parse the HTML using BeautifulSoup
+    soup = BeautifulSoup(html, "html5lib")
+
+    # Step 4: Extract the id=Datagrid1 table element
+    table = soup.find("table", {"id": "Datagrid1"})
+
+    # Step 5: Select all tr child elements and slice to exclude the first and last element
+    tr_elements = table.find_all("tr")[1:-1]
+
+    # Step 6: Extract and convert td elements to text, creating tuples for each row
+    rows = []
+    for tr in tr_elements:
+        td_elements = tr.find_all("td")
+        row_data = [td.get_text(strip=True) for td in td_elements]
+
+        # Pre-process the row_data
+        processed_row = preprocess_row_data(row_data)
+        rows.append(tuple(processed_row))
+
+    target = extract_target(table)
+    form = gen_form(soup, target) if target else None
+    return (rows, form)
+
+
+def gen_form(soup, target):
+    inputs = soup.find_all("input")
+    data = {}
+    for input in inputs:
+        data[input.get("name")] = input.get("value")
+    data["__EVENTTARGET"] = target
+    del data['Button1']
+    return data
+
+
+def extract_target(table):
+    target = None
+    links_row = table.find_all("tr")[-1]
+    current = links_row.find("span")
+    for link in current.find_next_siblings('a'):
+        if link.get_text(strip=True).strip().isdigit():
+            target = link["href"].split("'")[1]
+            break
+    return target
+
+
+def file_to_url(filename):
+    parsed_url = urlparse(filename)
+    url = urlunparse(
+        Components(
+            scheme='https',
+            netloc='gazettes.uk.gov.in',
+            query=path.splitext(parsed_url.query)[0],
+            path='',
+            url='/showgrid.aspx',
+            fragment=''
+        )
+    )
+    return url
+
+
+def iter_html(html):
+    rows, form = parse_html(html)
+    yield rows
+    # If we need to paginate this section
+    if form:
+        url = file_to_url(file)
+        t = form['__EVENTTARGET']
+        response = http.request_encode_body('POST', url, fields=form, encode_multipart=False)
+        yield from iter_html(response.data)
+
+
+http = urllib3.PoolManager()
+
 # Step 2: Iterate through each HTML file
 for file in html_files:
-    with open(file, 'r', encoding='utf-8') as f:
-        # Step 3: Parse the HTML using BeautifulSoup
-        soup = BeautifulSoup(f, 'html5lib')
-
-        # Step 4: Extract the id=Datagrid1 table element
-        table = soup.find("table", {"id": "Datagrid1"})
-
-        # Step 5: Select all tr child elements and slice to exclude the first and last element
-        tr_elements = table.find_all("tr")[1:-1]
-
-        # Step 6: Extract and convert td elements to text, creating tuples for each row
-        rows = []
-        for tr in tr_elements:
-            td_elements = tr.find_all("td")
-            row_data = [td.get_text(strip=True) for td in td_elements]
-
-            # Pre-process the row_data
-            processed_row = preprocess_row_data(row_data)
-            rows.append(tuple(processed_row))
-
-        # Extend the output_data list with the rows from the current file
-        output_data.extend(rows)
+    print(f"Processing {file}")
+    with open(file, "r", encoding="utf-8") as html:
+        for rows in iter_html(html):
+            output_data.extend(rows)

 # Sort the final list by the fifth column (Gazette Week Date)
 output_data.sort(key=lambda x: x[4])

 # Step 8: Dump the final output list into a CSV file
-with open("gazette_data.csv", 'w', newline='', encoding='utf-8') as csvfile:
+with open("gazette_data.csv", "w", newline="", encoding="utf-8") as csvfile:
     writer = csv.writer(csvfile)
-    # Write the header
     writer.writerow(header)
-    # Write the data
     writer.writerows(output_data)

 print("CSV file 'gazette_data.csv' has been created.")
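
For reference, a small illustration of the filename-to-URL mapping that file_to_url performs on pages mirrored by wget --adjust-extension. The query string in the example is hypothetical; only the host, the /showgrid.aspx path, and the trailing ".html" suffix come from the code above:

from os import path
from urllib.parse import urlparse

saved = "gazettes.uk.gov.in/showgrid.aspx?param=value.html"  # hypothetical mirrored filename
query = path.splitext(urlparse(saved).query)[0]              # strip the ".html" wget appended
print(f"https://gazettes.uk.gov.in/showgrid.aspx?{query}")
# -> https://gazettes.uk.gov.in/showgrid.aspx?param=value
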