Recursive fetch for metadata pages that require JS
This expands our coverage of gazette metadata to 100% - Closes #3
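The pager links on showgrid.aspx are ASP.NET __doPostBack(...) anchors that normally need JavaScript, so the new generate.py code follows them by re-submitting the page's hidden form with __EVENTTARGET set to the pager link's target and parsing the response through the same code path. Below is a rough sketch of that round trip, condensed from the new iter_html/gen_form/extract_target helpers in the generate.py diff; next_page is an illustrative name and not part of the commit:

import urllib3
from bs4 import BeautifulSoup

http = urllib3.PoolManager()

def next_page(url, html):
    # Return the HTML of the next DataGrid page, or None when there is no further page.
    soup = BeautifulSoup(html, "html5lib")
    pager = soup.find("table", {"id": "Datagrid1"}).find_all("tr")[-1]
    current = pager.find("span")  # the current page number is rendered as a <span>
    if current is None:
        return None
    # Pager hrefs look like javascript:__doPostBack('<target>',''); take the next numeric link
    link = next((a for a in current.find_next_siblings("a")
                 if a.get_text(strip=True).isdigit()), None)
    if link is None:
        return None
    # Re-submit the page's hidden form fields (__VIEWSTATE etc.), as the postback would
    form = {i.get("name"): i.get("value", "") for i in soup.find_all("input")}
    form["__EVENTTARGET"] = link["href"].split("'")[1]
    form.pop("Button1", None)  # don't re-trigger the search button
    return http.request_encode_body("POST", url, fields=form,
                                    encode_multipart=False).data

generate.py applies this recursively via iter_html until extract_target finds no further page link.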
parent d76b3e64bf
commit 8e67974e97
dl.sh (16 lines changed)
@@ -1,5 +1,8 @@
 #!/bin/bash
 
+# This script downloads all valid gazette files
+# by brute-forcing all valid filenames (=date.pdf)
+
 start_date="01 JAN 2006"
 current_date=$(date +"%d %b %Y")
 
@@ -9,7 +12,7 @@ download_gazette() {
   dir="pdfs/20$(date -d "$1" +"%y/%m")"
   url="https://gazettes.uk.gov.in/entry/gazette/gz$formatted_date.pdf"
   mkdir -p "$dir"
-  wget -nc -nv "$url" -O "$dir/$formatted_date.pdf"
+  wget -q -nc -nv "$url" -O "$dir/$formatted_date.pdf"
 }
 
 # Export the function so GNU Parallel can access it
@@ -26,4 +29,13 @@ done
 parallel -j 100 download_gazette ::: "${dates_to_download[@]}"
 
 find . -type f -empty -delete
 find . -type d -empty -delete
+
+# Now, we need to download the information files (HTML)
+# that contain the metadata for each Gazette file
+wget -i input.txt --recursive --adjust-extension --level 1 --ignore-tags=img,link --relative --no-parent
+
+# Now, we parse the metadata from the HTML files
+# and save it as a CSV file
+# in some cases, this will make a few further requests
+python generate.py
gazette_data.csv (2071 lines changed)
File diff suppressed because it is too large
generate.py (129 lines changed)
@@ -1,9 +1,16 @@
 import glob
 import csv
+from os import path
+import sys
+
+import urllib3
+from collections import namedtuple
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse, parse_qs,urlunparse
 import html5lib
 from datetime import datetime
+
 
 # New function for pre-processing row_data
 def preprocess_row_data(row_data):
     # Step 1: Drop the 6th column (index=5)
@@ -19,50 +26,116 @@ def preprocess_row_data(row_data):
     return row_data
 
 
+# namedtuple to match the internal signature of urlunparse
+Components = namedtuple(
+    typename='Components',
+    field_names=['scheme', 'netloc', 'url', 'path', 'query', 'fragment']
+)
+
 html_files = glob.glob("gazettes.uk.gov.in/showgrid*.html")
 
 # Step 7: Create a list to store all rows across all files
 output_data = []
 
 # Define the CSV file header
-header = ["GO No.", "GO Date", "GO Description", "Issued by", "Gazette Week Date", "Pg No"]
+header = [
+    "GO No.",
+    "GO Date",
+    "GO Description",
+    "Issued by",
+    "Gazette Week Date",
+    "Pg No",
+]
+
+# incomplete_links = []
+
+# def start_js_scrape(url):
+
+def parse_html(html):
+    # Step 3: Parse the HTML using BeautifulSoup
+    soup = BeautifulSoup(html, "html5lib")
+
+    # Step 4: Extract the id=Datagrid1 table element
+    table = soup.find("table", {"id": "Datagrid1"})
+
+    # Step 5: Select all tr child elements and slice to exclude the first and last element
+    tr_elements = table.find_all("tr")[1:-1]
+
+    # Step 6: Extract and convert td elements to text, creating tuples for each row
+    rows = []
+    for tr in tr_elements:
+        td_elements = tr.find_all("td")
+        row_data = [td.get_text(strip=True) for td in td_elements]
+
+        # Pre-process the row_data
+        processed_row = preprocess_row_data(row_data)
+        rows.append(tuple(processed_row))
+
+    target = extract_target(table)
+    form = gen_form(soup, target) if target else None
+
+    return (rows,form)
+
+
+def gen_form(soup, target):
+    inputs = soup.find_all("input")
+    data = {}
+    for input in inputs:
+        data[input.get("name")] = input.get("value")
+    data["__EVENTTARGET"] = target
+    del data['Button1']
+    return data
+
+
+def extract_target(table):
+    target = None
+    links_row = table.find_all("tr")[-1]
+    current = links_row.find("span")
+    for link in current.find_next_siblings('a'):
+        if link.get_text(strip=True).strip().isdigit():
+            target = link["href"].split("'")[1]
+            break
+    return target
+
+
+def file_to_url(filename):
+    parsed_url = urlparse(filename)
+    url = urlunparse(
+        Components(
+            scheme='https',
+            netloc='gazettes.uk.gov.in',
+            query=path.splitext(parsed_url.query)[0],
+            path='',
+            url='/showgrid.aspx',
+            fragment=''
+        )
+    )
+    return url
+
+
+def iter_html(html):
+    rows,form = parse_html(html)
+    yield rows
+
+    # If we need to paginate this section
+    if form:
+        url = file_to_url(file)
+        t = form['__EVENTTARGET']
+        response = http.request_encode_body('POST', url, fields=form, encode_multipart=False)
+
+        yield from iter_html(response.data)
+
+
+http = urllib3.PoolManager()
 # Step 2: Iterate through each HTML file
 for file in html_files:
-    with open(file, 'r', encoding='utf-8') as f:
-        # Step 3: Parse the HTML using BeautifulSoup
-        soup = BeautifulSoup(f, 'html5lib')
-
-        # Step 4: Extract the id=Datagrid1 table element
-        table = soup.find("table", {"id": "Datagrid1"})
-
-        # Step 5: Select all tr child elements and slice to exclude the first and last element
-        tr_elements = table.find_all("tr")[1:-1]
-
-        # Step 6: Extract and convert td elements to text, creating tuples for each row
-        rows = []
-        for tr in tr_elements:
-            td_elements = tr.find_all("td")
-            row_data = [td.get_text(strip=True) for td in td_elements]
-
-            # Pre-process the row_data
-            processed_row = preprocess_row_data(row_data)
-            rows.append(tuple(processed_row))
-
-        # Extend the output_data list with the rows from the current file
-        output_data.extend(rows)
+    print(f"Processing {file}")
+    with open(file, "r", encoding="utf-8") as html:
+        for rows in iter_html(html):
+            output_data.extend(rows)
 
 # Sort the final list by the fifth column (Gazette Week Date)
 output_data.sort(key=lambda x: x[4])
 
 # Step 8: Dump the final output list into a CSV file
-with open("gazette_data.csv", 'w', newline='', encoding='utf-8') as csvfile:
+with open("gazette_data.csv", "w", newline="", encoding="utf-8") as csvfile:
     writer = csv.writer(csvfile)
 
-    # Write the header
     writer.writerow(header)
 
-    # Write the data
     writer.writerows(output_data)
 
 print("CSV file 'gazette_data.csv' has been created.")