github.com/compromyse/enfold.git

author Raghuram Subramani <raghus2247@gmail.com> 2025-03-26 22:06:32.0 +05:30:00
committer Raghuram Subramani <raghus2247@gmail.com> 2025-03-26 22:06:32.0 +05:30:00
commit ef63d21480f1f83a660902da3f9ad2d5606b37c2 [patch]
tree 322b1d1e8da88a62e1cfd4b0c767f53d3460203d
parent 24b38a94e36794e33a1a432ef00eaf0c46957124
download ef63d21480f1f83a660902da3f9ad2d5606b37c2.tar.gz

multi-threaded, headless scraper
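In brief: court scraping is fanned out across a ThreadPoolExecutor (5 workers), rows are written to courts.csv through a lock-guarded ThreadSafeCSVWriter, and Selenium Firefox now runs with the --headless option. A condensed, self-contained sketch of the threading pattern follows; the fetch_rows stub is a hypothetical stand-in for the per-state Selenium work done by Scraper in the diff below, and only the writer and executor wiring mirror the patch:

import csv
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

class ThreadSafeCSVWriter:
    """CSV writer whose writerow() is safe to call from worker threads."""
    def __init__(self, filename):
        self.file = open(filename, 'w', newline='')
        self.writer = csv.writer(self.file)
        self.lock = threading.Lock()

    def writerow(self, row):
        with self.lock:  # serialize concurrent writes to the shared file
            self.writer.writerow(row)

    def close(self):
        self.file.close()

def fetch_rows(state):
    # Hypothetical stand-in for Scraper.scrape_districts()/scrape_complexes().
    return [(state, f'district-{i}', f'complex-{i}') for i in range(3)]

if __name__ == '__main__':
    writer = ThreadSafeCSVWriter('courts.csv')
    writer.writerow(['State', 'District', 'Complex'])
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_rows, s) for s in ['Delhi', 'Goa']]
        for future in as_completed(futures):
            for row in future.result():
                writer.writerow(row)
    writer.close()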



Diff

 scrape_ecourtindia_v6/.gitignore |  1 +
 scrape_ecourtindia_v6/clean.sh   |  3 +--
 scrape_ecourtindia_v6/main.py    | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 scrape_ecourtindia_v6/scraper.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
new file mode 100644
index 0000000..ef1949c 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -0,0 +1 @@
+courts.csv
diff --git a/scrape_ecourtindia_v6/clean.sh b/scrape_ecourtindia_v6/clean.sh
index bda1361..8c8a0ab 100755
--- a/scrape_ecourtindia_v6/clean.sh
+++ b/scrape_ecourtindia_v6/clean.sh
@@ -1,5 +1,4 @@
 #!/usr/bin/env bash
 rm -r html/* pdf/* db.json
 
-mkdir html
-mkdir pdf
+mkdir -p html pdf
diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py
index c81d0b6..1cadad2 100644
--- a/scrape_ecourtindia_v6/main.py
+++ b/scrape_ecourtindia_v6/main.py
@@ -1,18 +1,80 @@
+import csv
 from scraper import Scraper
 from tinydb import TinyDB
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
 
 db = TinyDB('db.json')
 
-if __name__ == '__main__':
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+
+    def close(self):
+        self.file.close()
+
+def scrape_state_thread(state, config, csv_writer):
+    scraper = Scraper(db, config)
+    scraper.close_modal()
+    try:
+        for district in scraper.scrape_districts(state):
+            for cmplx in scraper.scrape_complexes(state, district):
+                csv_writer.writerow([state, district, cmplx])
+    except Exception as e:
+        print(f"Error scraping {state}: {e}")
+    finally:
+        scraper.driver.quit()
+
+def scrape_courts():
+    config = {}
+
+    m = Scraper(db, config)
+    m.close_modal()
+
+    csv_writer = ThreadSafeCSVWriter('courts.csv')
+    csv_writer.writerow(['State', 'District', 'Complex'])
+
+    states = m.scrape_states()
+    m.driver.close()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_state_thread, state, config, csv_writer)
+            for state in states
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+def scrape_orders():
     config = {}
 
-    m = Scraper(db, config)
-    m.close_modal()
-
     config['state'] = input('Select a state: ')
     config['district'] = input('Select a district: ')
     config['court_complex'] = input('Select a court complex: ')
     config['court_establishment'] = input('Select a court establishment: ')
     config['act'] = input('Select an act: ')
 
+    m = Scraper(db, config)
     m.run()
+    m.select_court()
+    m.goto_acts()
+    m.select_act()
+    m.handle_table()
+
+    m.driver.close()
+
+if __name__ == '__main__':
+    scrape_courts()
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
index 69d3336..cdab2fd 100644
--- a/scrape_ecourtindia_v6/scraper.py
+++ b/scrape_ecourtindia_v6/scraper.py
@@ -6,6 +6,7 @@

 from selenium.webdriver import Firefox
 from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.support.select import Select
 
 from bs4 import BeautifulSoup
@@ -19,17 +20,14 @@
         self.db = db
         self.config = config
 
-        self.driver = Firefox()
+        options = Options()
+        options.add_argument("--headless")
+
+        self.driver = Firefox(options=options)
         self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
 
         self.current_view = {}
 
-    def run(self):
-        self.close_modal()
-        self.goto_acts()
-        self.select_act()
-        self.handle_table()
-
     def close_modal(self):
         sleep(3)
         self.driver.execute_script('closeModel({modal_id:"validateError"})')
@@ -50,7 +48,42 @@
         self.driver.find_element(By.ID, 'radDAct').click()
         self.submit_search()
 
-    def goto_acts(self):
+    def scrape_states(self):
+        element = self.driver.find_element(By.ID, 'sess_state_code')
+        options = Select(element).options
+        states = [ option.text for option in options[1:] ]
+        print(f'STATES: {states}')
+
+        sleep(0.2)
+
+        return states
+
+    def scrape_districts(self, state):
+        self.select('sess_state_code', state)
+        sleep(0.2)
+
+        element = self.driver.find_element(By.ID, 'sess_dist_code')
+        options = Select(element).options
+        districts = [ option.text for option in options[1:] ]
+        print(f'DISTRICTS: {districts}')
+
+        return districts
+
+    def scrape_complexes(self, state, district):
+        self.select('sess_state_code', state)
+        sleep(0.2)
+        self.select('sess_dist_code', district)
+        sleep(0.2)
+
+        element = self.driver.find_element(By.ID, 'court_complex_code')
+        options = Select(element).options
+        complexes = [ option.text for option in options[1:] ]
+        print(f'COMPLEXES: {complexes}')
+
+        return complexes
+
+    def select_court(self):
+        sleep(2)
         while True:
             self.select('sess_state_code', self.config['state'])
             self.select('sess_dist_code', self.config['district'])
@@ -66,7 +99,7 @@

         self.select('court_est_code', self.config['court_establishment'])
 
-        sleep(1)
+    def goto_acts(self):
         element = self.driver.find_element(By.ID, 'act-tabMenu')
         element.click()
         sleep(1)