From ef63d21480f1f83a660902da3f9ad2d5606b37c2 Mon Sep 17 00:00:00 2001
From: Raghuram Subramani <raghus2247@gmail.com>
Date: Wed, 26 Mar 2025 22:06:32 +0530
Subject: [PATCH] multi-threaded, headless scraper

---
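Notes:

The core of the change is the concurrency pattern: one WebDriver per
thread, and a single shared CSV file guarded by a lock, since neither
Selenium drivers nor csv writer objects are safe to share across
threads. Below is a minimal self-contained sketch of the same pattern;
fetch_rows(), worker() and the sample states are illustrative
stand-ins for the real Scraper calls, not part of this patch:

    import csv
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    class ThreadSafeCSVWriter:
        # Serializes writerow() calls from many threads onto one file.
        def __init__(self, filename):
            self.file = open(filename, 'w', newline='')
            self.writer = csv.writer(self.file)
            self.lock = threading.Lock()

        def writerow(self, row):
            with self.lock:  # only one thread may write at a time
                self.writer.writerow(row)

        def close(self):
            self.file.close()

    def fetch_rows(state):
        # Hypothetical stand-in for the per-thread Scraper calls.
        return [[state, f'district-{i}', f'complex-{i}'] for i in range(3)]

    def worker(state, out):
        for row in fetch_rows(state):
            out.writerow(row)

    if __name__ == '__main__':
        out = ThreadSafeCSVWriter('courts.csv')
        out.writerow(['State', 'District', 'Complex'])
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(worker, s, out) for s in ('A', 'B', 'C')]
            for f in as_completed(futures):
                f.result()  # surface any exception raised in a worker
        out.close()

The headless switch uses the stock Selenium Firefox options mechanism;
an equivalent standalone form, assuming geckodriver is on PATH:

    from selenium.webdriver import Firefox
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.add_argument("--headless")  # run Firefox without a display
    driver = Firefox(options=options)
    driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
    print(driver.title)
    driver.quit()
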
 scrape_ecourtindia_v6/.gitignore |  1 +
 scrape_ecourtindia_v6/clean.sh   |  3 +--
 scrape_ecourtindia_v6/main.py    | 70 ++++++++++++++++++++++++++++++++++++--
 scrape_ecourtindia_v6/scraper.py | 51 +++++++++++++++++++++++++------
 4 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
new file mode 100644
index 0000000..ef1949c
--- /dev/null
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -0,0 +1 @@
+courts.csv
diff --git a/scrape_ecourtindia_v6/clean.sh b/scrape_ecourtindia_v6/clean.sh
index bda1361..8c8a0ab 100755
--- a/scrape_ecourtindia_v6/clean.sh
+++ b/scrape_ecourtindia_v6/clean.sh
@@ -1,5 +1,4 @@
 #!/usr/bin/env bash
 
 rm -r html/* pdf/* db.json
-mkdir html
-mkdir pdf
+mkdir -p html pdf
diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py
index c81d0b6..1cadad2 100644
--- a/scrape_ecourtindia_v6/main.py
+++ b/scrape_ecourtindia_v6/main.py
@@ -1,18 +1,80 @@
+import csv
 from scraper import Scraper
 from tinydb import TinyDB
-import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
 
 db = TinyDB('db.json')
 
-if __name__ == '__main__':
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+
+    def close(self):
+        self.file.close()
+
+def scrape_state_thread(state, config, csv_writer):
+    scraper = Scraper(db, config)
+    scraper.close_modal()
+    try:
+        for district in scraper.scrape_districts(state):
+            for cmplx in scraper.scrape_complexes(state, district):
+                csv_writer.writerow([state, district, cmplx])
+    except Exception as e:
+        print(f"Error scraping {state}: {e}")
+    finally:
+        scraper.driver.quit()
+
+def scrape_courts():
+    config = {}
+
+    m = Scraper(db, config)
+    m.close_modal()
+
+    csv_writer = ThreadSafeCSVWriter('courts.csv')
+    csv_writer.writerow(['State', 'District', 'Complex'])
+
+    states = m.scrape_states()
+    m.driver.close()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_state_thread, state, config, csv_writer)
+            for state in states
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+def scrape_orders():
     config = {}
 
+    m = Scraper(db, config)
+    m.close_modal()
+
     config['state'] = input('Select a state: ')
     config['district'] = input('Select a district: ')
     config['court_complex'] = input('Select a court complex: ')
     config['court_establishment'] = input('Select a court establishment: ')
     config['act'] = input('Select an act: ')
 
-    m = Scraper(db, config)
-    m.run()
+    m.select_court()
+    m.goto_acts()
+    m.select_act()
+    m.handle_table()
+    m.driver.close()
+
+if __name__ == '__main__':
+    scrape_courts()
 
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
index 69d3336..cdab2fd 100644
--- a/scrape_ecourtindia_v6/scraper.py
+++ b/scrape_ecourtindia_v6/scraper.py
@@ -6,6 +6,7 @@
 from selenium.webdriver import Firefox
 from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.support.select import Select
 
 from bs4 import BeautifulSoup
 
@@ -19,17 +20,14 @@
         self.db = db
         self.config = config
 
-        self.driver = Firefox()
+        options = Options()
+        options.add_argument("--headless")
+
+        self.driver = Firefox(options=options)
         self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
 
         self.current_view = {}
 
-    def run(self):
-        self.close_modal()
-        self.goto_acts()
-        self.select_act()
-        self.handle_table()
-
     def close_modal(self):
         sleep(3)
         self.driver.execute_script('closeModel({modal_id:"validateError"})')
@@ -50,7 +48,42 @@
         self.driver.find_element(By.ID, 'radDAct').click()
         self.submit_search()
 
-    def goto_acts(self):
+    def scrape_states(self):
+        element = self.driver.find_element(By.ID, 'sess_state_code')
+        options = Select(element).options
+        states = [ option.text for option in options[1:] ]
+        print(f'STATES: {states}')
+
+        sleep(0.2)
+
+        return states
+
+    def scrape_districts(self, state):
+        self.select('sess_state_code', state)
+        sleep(0.2)
+
+        element = self.driver.find_element(By.ID, 'sess_dist_code')
+        options = Select(element).options
+        districts = [ option.text for option in options[1:] ]
+        print(f'DISTRICTS: {districts}')
+
+        return districts
+
+    def scrape_complexes(self, state, district):
+        self.select('sess_state_code', state)
+        sleep(0.2)
+        self.select('sess_dist_code', district)
+        sleep(0.2)
+
+        element = self.driver.find_element(By.ID, 'court_complex_code')
+        options = Select(element).options
+        complexes = [ option.text for option in options[1:] ]
+        print(f'COMPLEXES: {complexes}')
+
+        return complexes
+
+    def select_court(self):
+        sleep(2)
         while True:
             self.select('sess_state_code', self.config['state'])
             self.select('sess_dist_code', self.config['district'])
@@ -66,7 +99,7 @@
 
         self.select('court_est_code', self.config['court_establishment'])
 
-        sleep(1)
+    def goto_acts(self):
         element = self.driver.find_element(By.ID, 'act-tabMenu')
         element.click()
         sleep(1)
-- 
rgit 0.1.5