multi-threaded, headless scraper
Diff
scrape_ecourtindia_v6/.gitignore | 1 +
scrape_ecourtindia_v6/clean.sh | 3 +--
scrape_ecourtindia_v6/main.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
scrape_ecourtindia_v6/scraper.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++++---
4 files changed, 110 insertions(+), 15 deletions(-)
@@ -1,0 +1,1 @@
courts.csv
@@ -1,5 +1,4 @@
# Reset scraper output: delete the contents of html/ and pdf/ plus db.json.
rm -r html/* pdf/* db.json
# NOTE(review): the two single-directory mkdir calls below look like the
# pre-change lines of this hunk, superseded by the `mkdir -p` line - confirm.
mkdir html
mkdir pdf
# -p creates both directories and does not fail when they already exist.
mkdir -p html pdf
@@ -1,18 +1,80 @@
import csv
from scraper import Scraper
from tinydb import TinyDB
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
# Module-level TinyDB handle backed by db.json; it is passed to every Scraper
# constructed in this file.
db = TinyDB('db.json')
# NOTE(review): this guard has no body of its own here and a second guard
# closes the file - it looks like a leftover pre-change line; confirm.
if __name__ == '__main__':
class ThreadSafeCSVWriter:
    """CSV writer whose rows can be appended from several threads.

    A single lock serialises access to the underlying ``csv.writer`` so
    that rows written concurrently are never interleaved.  The instance
    can be used as a context manager, which guarantees the file is closed
    even when the surrounding code raises.

    Attributes are kept public (``file``, ``writer``, ``lock``) for
    backward compatibility with existing callers.
    """

    def __init__(self, filename):
        """Open *filename* for writing (truncating it) and prepare the writer."""
        # newline='' is what the csv module requires to control line endings
        # itself; UTF-8 is explicit so non-ASCII names do not depend on the
        # platform's locale encoding.
        self.file = open(filename, 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)
        self.lock = threading.Lock()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False

    def writerow(self, row):
        """Write one row while holding the lock."""
        with self.lock:
            self.writer.writerow(row)

    def close(self):
        """Close the file; safe to call more than once."""
        # Take the lock so the file is not closed in the middle of a write
        # that another thread has already started.
        with self.lock:
            self.file.close()
def scrape_state_thread(state, config, csv_writer):
    """Scrape every district and court complex of *state* into *csv_writer*.

    Intended to run as one worker task: it creates its own Scraper (and
    therefore its own browser), writes one ``[state, district, complex]``
    row per complex, and always quits the browser before returning.

    Args:
        state: State name as it appears in the state dropdown.
        config: Configuration dict handed to the Scraper.
        csv_writer: Object with a ``writerow(row)`` method; it is shared
            between workers, so it must be safe to call from threads.

    Errors raised while scraping are printed rather than propagated, so a
    failure in one state does not stop the others.  A failure to construct
    the Scraper itself still propagates to the caller.
    """
    scraper = Scraper(db, config)
    try:
        # close_modal() is inside the try so that a failure here still
        # reaches the finally clause and the browser is not left running.
        scraper.close_modal()
        for district in scraper.scrape_districts(state):
            for cmplx in scraper.scrape_complexes(state, district):
                csv_writer.writerow([state, district, cmplx])
    except Exception as e:
        print(f"Error scraping {state}: {e}")
    finally:
        scraper.driver.quit()
def scrape_courts():
    """Write every (state, district, court complex) triple to courts.csv.

    A first Scraper reads the list of states; each state is then scraped by
    ``scrape_state_thread`` on a pool of five worker threads, all sharing
    one ThreadSafeCSVWriter.  Worker failures are reported on stdout and do
    not abort the remaining states.
    """
    config = {}

    # The bootstrap browser is only needed for the state list.  quit() (not
    # close()) ends the whole driver session, and the finally clause makes
    # sure that happens even if reading the states fails.
    m = Scraper(db, config)
    try:
        m.close_modal()
        states = m.scrape_states()
    finally:
        m.driver.quit()

    # Opened only after the state list is known, so a failure above does not
    # truncate an existing courts.csv.
    csv_writer = ThreadSafeCSVWriter('courts.csv')
    try:
        csv_writer.writerow(['State', 'District', 'Complex'])

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [
                executor.submit(scrape_state_thread, state, config, csv_writer)
                for state in states
            ]

            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    print(f"A thread encountered an error: {e}")
    finally:
        # Flush and release the file even when the pool raises.
        csv_writer.close()
# Interactive flow: prompt on stdin for a court and an act, then drive a
# Scraper through the court selection and act search steps.
def scrape_orders():
config = {}
m = Scraper(db, config)
m.close_modal()
# config is the same dict object already handed to the Scraper above, so the
# answers below are added to it after construction.
config['state'] = input('Select a state: ')
config['district'] = input('Select a district: ')
config['court_complex'] = input('Select a court complex: ')
config['court_establishment'] = input('Select a court establishment: ')
config['act'] = input('Select an act: ')
# NOTE(review): rebinding m here creates a second Scraper while the first is
# never closed; this line and m.run() look like pre-change lines of the diff
# that the individual step calls below replace - confirm which is intended.
m = Scraper(db, config)
m.run()
m.select_court()
m.goto_acts()
m.select_act()
m.handle_table()
# NOTE(review): uses driver.close() whereas scrape_state_thread uses
# driver.quit() - confirm the browser session is fully released here.
m.driver.close()
# Script entry point: running this file directly collects the court list.
if __name__ == '__main__':
    scrape_courts()
@@ -6,6 +6,7 @@
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
@@ -19,17 +20,14 @@
self.db = db
self.config = config
self.driver = Firefox()
options = Options()
options.add_argument("--headless")
self.driver = Firefox(options=options)
self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
self.current_view = {}
def run(self):
    """Call close_modal, goto_acts, select_act and handle_table, in that order."""
    # NOTE(review): scrape_orders in main.py invokes these steps one by one;
    # this method may be the pre-change form of that flow - confirm.
    self.close_modal()
    self.goto_acts()
    self.select_act()
    self.handle_table()
def close_modal(self):
sleep(3)
self.driver.execute_script('closeModel({modal_id:"validateError"})')
@@ -50,7 +48,42 @@
self.driver.find_element(By.ID, 'radDAct').click()
self.submit_search()
def goto_acts(self):
def scrape_states(self):
    """Return the text of every entry in the state dropdown.

    Reads the ``<select>`` element with id ``sess_state_code`` and skips
    its first option (presumably a placeholder entry - confirm).
    """
    dropdown = Select(self.driver.find_element(By.ID, 'sess_state_code'))
    states = [entry.text for entry in dropdown.options[1:]]

    print(f'STATES: {states}')
    sleep(0.2)
    return states
def scrape_districts(self, state):
    """Return the text of every district entry offered for *state*.

    Selects *state* in ``sess_state_code``, then reads the ``<select>``
    with id ``sess_dist_code``, skipping its first option (presumably a
    placeholder entry - confirm).
    """
    self.select('sess_state_code', state)
    # Short pause before reading; presumably gives the district list time
    # to refresh after the state changes - confirm.
    sleep(0.2)

    dropdown = Select(self.driver.find_element(By.ID, 'sess_dist_code'))
    districts = [entry.text for entry in dropdown.options[1:]]

    print(f'DISTRICTS: {districts}')
    return districts
def scrape_complexes(self, state, district):
    """Return the text of every court complex entry for *state* / *district*.

    Selects *state* and then *district* in their dropdowns, pausing briefly
    after each, and reads the ``<select>`` with id ``court_complex_code``,
    skipping its first option (presumably a placeholder entry - confirm).
    """
    # Selection order matters: state first, then district, each followed by
    # the same short pause the original used.
    for dropdown_id, choice in (
        ('sess_state_code', state),
        ('sess_dist_code', district),
    ):
        self.select(dropdown_id, choice)
        sleep(0.2)

    dropdown = Select(self.driver.find_element(By.ID, 'court_complex_code'))
    complexes = [entry.text for entry in dropdown.options[1:]]

    print(f'COMPLEXES: {complexes}')
    return complexes
def select_court(self):
sleep(2)
while True:
self.select('sess_state_code', self.config['state'])
self.select('sess_dist_code', self.config['district'])
@@ -66,7 +99,7 @@
self.select('court_est_code', self.config['court_establishment'])
sleep(1)
def goto_acts(self):
element = self.driver.find_element(By.ID, 'act-tabMenu')
element.click()
sleep(1)