Add script to generate CSV dataset
This commit is contained in:
parent
2654c9679c
commit
1e92b30bde
|
@ -1,2 +1,3 @@
|
|||
pdfs/
|
||||
releases/
|
||||
releases/
|
||||
gazettes.uk.gov.in/
|
|
@ -0,0 +1,68 @@
|
|||
import glob
|
||||
import csv
|
||||
from bs4 import BeautifulSoup
|
||||
import html5lib
|
||||
from datetime import datetime
|
||||
|
||||
# New function for pre-processing row_data
|
||||
def preprocess_row_data(row_data):
    """Normalise one scraped table row before it is written to the CSV.

    The 6th column (index 5) is dropped, and the two date columns
    (index 1 and index 4) are converted from the ``"%d %b %Y"`` form
    (e.g. ``"05 Jan 2020"``) to ISO ``"%Y-%m-%d"``.

    Args:
        row_data: Sequence of cell strings for one row. Any sequence is
            accepted (list or tuple); it is never modified.

    Returns:
        A new list holding the processed cells.

    Raises:
        ValueError: If a date cell does not match ``"%d %b %Y"``.
        IndexError: If the row has fewer than five columns.
    """
    date_format = "%d %b %Y"

    # Work on a fresh list so tuple inputs are supported and the caller's
    # object is left untouched.
    cells = list(row_data)

    # Slice deletion is a no-op when the row has no 6th column, matching
    # the forgiving behaviour of slicing around index 5.
    del cells[5:6]

    for index in (1, 4):
        # strptime rejects leading/trailing whitespace, so trim first.
        parsed = datetime.strptime(cells[index].strip(), date_format)
        cells[index] = parsed.strftime("%Y-%m-%d")

    return cells
|
||||
|
||||
|
||||
# Collect every saved "showgrid" page. glob returns paths in arbitrary
# order, so sort them: the final sort below keys only on the gazette week
# date, and a fixed input order keeps rows that share a date in a
# reproducible order from run to run.
html_files = sorted(glob.glob("gazettes.uk.gov.in/showgrid*.html"))

# Rows gathered across all files; each entry is a tuple of processed cells.
output_data = []

# Column names written as the first CSV row (after the 6th scraped column
# has been dropped by preprocess_row_data).
header = ["GO No.", "GO Date", "GO Description", "Issued by", "Gazette Week Date", "Pg No"]

for file in html_files:
    with open(file, 'r', encoding='utf-8') as f:
        # The html5lib parser is used for the saved pages.
        soup = BeautifulSoup(f, 'html5lib')

    # The listing lives in the table with id="Datagrid1".
    table = soup.find("table", {"id": "Datagrid1"})
    if table is None:
        # find() returns None when nothing matches; skip such a page
        # instead of aborting the whole export with an AttributeError.
        print("Skipping {}: no table with id 'Datagrid1' found.".format(file))
        continue

    # The first and last <tr> are excluded; only the rows in between are
    # treated as data rows.
    for tr in table.find_all("tr")[1:-1]:
        row_data = [td.get_text(strip=True) for td in tr.find_all("td")]
        output_data.append(tuple(preprocess_row_data(row_data)))

# Order by the fifth column (Gazette Week Date). The dates are ISO
# formatted strings at this point, so plain string comparison sorts them
# chronologically.
output_data.sort(key=lambda x: x[4])

# newline='' is what the csv module expects for files it writes.
with open("gazette_data.csv", 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(output_data)

print("CSV file 'gazette_data.csv' has been created.")
|
|
@ -0,0 +1,18 @@
|
|||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2006
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2007
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2008
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2009
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2010
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2011
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2012
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2013
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2014
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2015
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2016
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2017
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2018
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2019
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2020
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2021
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2022
|
||||
https://gazettes.uk.gov.in/monthwise.aspx?yr=2023
|
Loading…
Reference in New Issue