Add script to generate CSV dataset

This commit is contained in:
Nemo 2023-11-08 13:07:55 +05:30
parent 2654c9679c
commit 1e92b30bde
3 changed files with 88 additions and 1 deletions

3
.gitignore vendored
View File

@ -1,2 +1,3 @@
pdfs/
releases/
releases/
gazettes.uk.gov.in/

68
generate.py Normal file
View File

@ -0,0 +1,68 @@
import glob
import csv
from bs4 import BeautifulSoup
import html5lib
from datetime import datetime
def preprocess_row_data(row_data):
    """Normalise one scraped gazette table row.

    Drops the 6th cell (index 5), then rewrites the GO date (index 1)
    and the gazette week date (index 4) from "DD Mon YYYY" form into
    ISO "YYYY-MM-DD" strings.  Returns a new list; raises ValueError
    if either date cell does not match the expected format.
    """
    # Keep every cell except index 5.
    cells = [cell for idx, cell in enumerate(row_data) if idx != 5]
    source_format = "%d %b %Y"
    # After the drop, the two date columns sit at indices 1 and 4.
    for date_idx in (1, 4):
        parsed = datetime.strptime(cells[date_idx], source_format)
        cells[date_idx] = parsed.strftime("%Y-%m-%d")
    return cells
html_files = glob.glob("gazettes.uk.gov.in/showgrid*.html")

# One tuple per gazette entry, accumulated across every scraped page.
output_data = []

# Column headings for the generated CSV file.
header = ["GO No.", "GO Date", "GO Description", "Issued by", "Gazette Week Date", "Pg No"]

for html_path in html_files:
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html5lib')
    # The gazette listing lives in the table with id=Datagrid1; slice off
    # the header row (first <tr>) and the pager row (last <tr>).
    table = soup.find("table", {"id": "Datagrid1"})
    for tr in table.find_all("tr")[1:-1]:
        cells = [td.get_text(strip=True) for td in tr.find_all("td")]
        output_data.append(tuple(preprocess_row_data(cells)))

# The dates were normalised to ISO form, so a plain string sort on the
# fifth column orders rows chronologically by Gazette Week Date.
output_data.sort(key=lambda row: row[4])

# Write the combined, sorted dataset out as CSV.
with open("gazette_data.csv", 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(output_data)

print("CSV file 'gazette_data.csv' has been created.")

18
input.txt Normal file
View File

@ -0,0 +1,18 @@
https://gazettes.uk.gov.in/monthwise.aspx?yr=2006
https://gazettes.uk.gov.in/monthwise.aspx?yr=2007
https://gazettes.uk.gov.in/monthwise.aspx?yr=2008
https://gazettes.uk.gov.in/monthwise.aspx?yr=2009
https://gazettes.uk.gov.in/monthwise.aspx?yr=2010
https://gazettes.uk.gov.in/monthwise.aspx?yr=2011
https://gazettes.uk.gov.in/monthwise.aspx?yr=2012
https://gazettes.uk.gov.in/monthwise.aspx?yr=2013
https://gazettes.uk.gov.in/monthwise.aspx?yr=2014
https://gazettes.uk.gov.in/monthwise.aspx?yr=2015
https://gazettes.uk.gov.in/monthwise.aspx?yr=2016
https://gazettes.uk.gov.in/monthwise.aspx?yr=2017
https://gazettes.uk.gov.in/monthwise.aspx?yr=2018
https://gazettes.uk.gov.in/monthwise.aspx?yr=2019
https://gazettes.uk.gov.in/monthwise.aspx?yr=2020
https://gazettes.uk.gov.in/monthwise.aspx?yr=2021
https://gazettes.uk.gov.in/monthwise.aspx?yr=2022
https://gazettes.uk.gov.in/monthwise.aspx?yr=2023