From a02c8f4c8643b4b9a531e185813c5d82b6866ec0 Mon Sep 17 00:00:00 2001
From: Raghuram Subramani <raghus2247@gmail.com>
Date: Thu, 27 Mar 2025 23:05:13 +0530
Subject: [PATCH] Parallelize order scraping and automate captcha/PDF fetch

Move court-number discovery into orders_scrape_courts.py, rewrite
scrape_orders.py to process each court from the input CSV in a thread
pool and record results in TinyDB behind a lock, and extend
ScraperOrders to solve the search captcha with pytesseract and download
order PDFs with the browser session's cookies.
---
 scrape_ecourtindia_v6/orders_scrape_courts.py        | 131 +++++++++++++++++
 scrape_ecourtindia_v6/scrape_orders.py               | 115 +++++----------
 scrape_ecourtindia_v6/modules/scraper_case_status.py |   2 +-
 scrape_ecourtindia_v6/modules/scraper_orders.py      |  84 ++++++++++++-
 4 files changed, 258 insertions(+), 74 deletions(-)

diff --git a/scrape_ecourtindia_v6/orders_scrape_courts.py b/scrape_ecourtindia_v6/orders_scrape_courts.py
new file mode 100644
index 0000000..597ce9f
--- /dev/null
+++ b/scrape_ecourtindia_v6/orders_scrape_courts.py
@@ -0,0 +1,131 @@
+import csv
+from time import sleep
+from modules.scraper_orders import ScraperOrders
+from selenium.webdriver.common.by import By
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+from tinydb import TinyDB
+
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+            print(f'Wrote: {row}')
+
+    def close(self):
+        self.file.close()
+
+def scrape_district(state, district, csv_writer):
+    try:
+        config = {}
+        scraper = ScraperOrders(None, config)  # db is unused when only listing courts
+        scraper.close_modal()
+
+        scraper.select('sess_state_code', state)
+        scraper.select('sess_dist_code', district)
+
+        complexes = scraper.scrape_complexes()
+        scraper.select('court_complex_code', complexes[0])
+
+        sleep(2)
+        scraper.goto_courtnumber()
+
+        for cmplx in complexes:
+            while True:  # dismiss any error modal before switching complexes
+                sleep(0.5)
+                try:
+                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                    if modal_is_open:
+                        scraper.close_modal()
+                        continue
+                    break
+                except Exception:
+                    break
+
+            scraper.select('court_complex_code', cmplx)
+            sleep(0.5)
+
+            court_numbers = scraper.get_court_numbers()
+            for court_number in court_numbers:
+                row = [state, district, cmplx, court_number]
+                csv_writer.writerow(row)
+
+        scraper.driver.quit()
+
+    except Exception as e:
+        print(f"Error scraping district {district}: {e}")
+
+def scrape_courts():
+    state = 'Uttar Pradesh'
+
+    config = {}
+    scraper = ScraperOrders(None, config)  # db is unused when only listing districts
+    scraper.close_modal()
+    scraper.select('sess_state_code', state)
+
+    districts = scraper.scrape_districts()
+    scraper.driver.quit()
+
+    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
+    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_district, state, district, csv_writer)
+            for district in districts
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+def scrape_orders(courts):
+    csvfile = open(courts, newline='')
+    reader = csv.reader(csvfile)
+
+    for row in reader:
+        print(row)
+        config = {}
+        scraper = ScraperOrders(TinyDB('orders.json'), config)
+        scraper.close_modal()
+
+        scraper.select('sess_state_code', row[0])
+        scraper.select('sess_dist_code', row[1])
+
+        while True:  # dismiss any error modal before selecting the complex
+            sleep(0.5)
+            try:
+                modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                if modal_is_open:
+                    scraper.close_modal()
+                    continue
+                break
+            except Exception:
+                break
+
+        scraper.select('court_complex_code', row[2])
+        sleep(1)
+        scraper.goto_courtnumber()
+
+        scraper.select('nnjudgecode1', row[3])
+        scraper.driver.find_element(By.ID, 'radBoth2').click()
+        scraper.submit_search()
+
+        scraper.parse_orders_table()
+        scraper.handle_orders()
+
+        break  # NOTE: stops after the first CSV row
+
+    csvfile.close()
+
+if __name__ == '__main__':
+    scrape_orders('csv/2023-24_pocso.csv')
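Note on the writer above: csv.writer is not synchronized, so without the lock
concurrent workers could interleave partial rows. A minimal, Selenium-free
sketch of the same ThreadSafeCSVWriter pattern (worker() and 'demo.csv' are
stand-ins, not part of the patch):

    import csv
    import threading
    from concurrent.futures import ThreadPoolExecutor

    class ThreadSafeCSVWriter:
        def __init__(self, filename):
            self.file = open(filename, 'w', newline='')
            self.writer = csv.writer(self.file)
            self.lock = threading.Lock()  # serializes all writerow() calls

        def writerow(self, row):
            with self.lock:
                self.writer.writerow(row)

        def close(self):
            self.file.close()

    def worker(n, out):
        out.writerow([n, n * n])  # stand-in for one scraped court row

    out = ThreadSafeCSVWriter('demo.csv')
    with ThreadPoolExecutor(max_workers=5) as pool:
        futures = [pool.submit(worker, n, out) for n in range(10)]
        for f in futures:
            f.result()  # re-raises any worker exception
    out.close()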
diff --git a/scrape_ecourtindia_v6/scrape_orders.py b/scrape_ecourtindia_v6/scrape_orders.py
index 74cdd56..54a2d80 100644
--- a/scrape_ecourtindia_v6/scrape_orders.py
+++ b/scrape_ecourtindia_v6/scrape_orders.py
@@ -1,82 +1,71 @@
 import csv
 from time import sleep
+
+from tinydb import TinyDB
 from modules.scraper_orders import ScraperOrders
 from selenium.webdriver.common.by import By
 from concurrent.futures import ThreadPoolExecutor, as_completed
+
 import threading
 
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
+class ThreadSafeDB:
+    def __init__(self):
+        self.db = TinyDB('orders.json')
         self.lock = threading.Lock()
-
-    def writerow(self, row):
+
+    def insert(self, record):
         with self.lock:
-            self.writer.writerow(row)
-            print(f'Wrote: {row}')
+            self.db.insert(record)
+            print(f'INSERTED: {record}')
 
-    def close(self):
-        self.file.close()
+db = ThreadSafeDB()
 
-def scrape_district(state, district, csv_writer):
+def scrape_single_court(row):
     try:
         config = {}
-        scraper = ScraperOrders(config)
+        scraper = ScraperOrders(db, config)
         scraper.close_modal()
 
-        scraper.select('sess_state_code', state)
-        scraper.select('sess_dist_code', district)
-
-        complexes = scraper.scrape_complexes()
-        scraper.select('court_complex_code', complexes[0])
-
-        sleep(2)
-        scraper.goto_courtnumber()
-
-        for cmplx in complexes:
-            while True:
-                sleep(0.5)
-                try:
-                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
-                    if modal_is_open:
-                        scraper.close_modal()
-                        continue
-                    break
-                except:
-                    break
-
-            scraper.select('court_complex_code', cmplx)
+        scraper.select('sess_state_code', row[0])
+        scraper.select('sess_dist_code', row[1])
+
+        while True:  # dismiss any error modal before selecting the complex
             sleep(0.5)
-
-            court_numbers = scraper.get_court_numbers()
-            for court_number in court_numbers:
-                row = [state, district, cmplx, court_number]
-                csv_writer.writerow(row)
+            try:
+                modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                if modal_is_open:
+                    scraper.close_modal()
+                    continue
+                break
+            except Exception:
+                break
 
+        scraper.select('court_complex_code', row[2])
+        sleep(1)
+        scraper.goto_courtnumber()
+        scraper.select('nnjudgecode1', row[3])
+
+        scraper.driver.find_element(By.ID, 'radBoth2').click()
+
+        scraper.submit_search()
+        scraper.parse_orders_table()
+        scraper.handle_orders()
+
         scraper.driver.quit()
 
     except Exception as e:
-        print(f"Error scraping district {district}: {e}")
+        print(f"Error processing court {row}: {e}")
+
+def scrape_orders(courts_csv):
+    with open(courts_csv, newline='') as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader, None)  # skip the header row
+        courts = list(reader)
 
-def scrape_courts():
-    state = 'Uttar Pradesh'
-
-    config = {}
-    scraper = ScraperOrders(config)
-    scraper.close_modal()
-    scraper.select('sess_state_code', state)
-
-    districts = scraper.scrape_districts()
-    scraper.driver.quit()
-
-    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
-    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
-
     with ThreadPoolExecutor(max_workers=5) as executor:
         futures = [
-            executor.submit(scrape_district, state, district, csv_writer)
-            for district in districts
+            executor.submit(scrape_single_court, court)
+            for court in courts
         ]
 
         for future in as_completed(futures):
@@ -85,16 +74,6 @@
             except Exception as e:
                 print(f"A thread encountered an error: {e}")
 
-    csv_writer.close()
-
-def scrape_orders(courts):
-    csvfile = open(courts, newline='')
-    reader = csv.reader(csvfile)
-
-    for row in reader:
-        print(row)
-
-    csvfile.close()
-
 if __name__ == '__main__':
-    scrape_orders('csv/2023-24_pocso.csv')
+    input_file = 'csv/2023-24_pocso.csv'
+    scrape_orders(input_file)
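The lock in ThreadSafeDB matters: TinyDB makes no thread-safety guarantees,
so a bare TinyDB handle shared across pool workers could corrupt orders.json.
A self-contained sketch of the same guarded-insert pattern ('demo.json' and
the records are placeholders):

    import threading
    from concurrent.futures import ThreadPoolExecutor
    from tinydb import TinyDB

    class ThreadSafeDB:
        def __init__(self, path):
            self.db = TinyDB(path)
            self.lock = threading.Lock()

        def insert(self, record):
            with self.lock:  # one writer at a time
                self.db.insert(record)

    db = ThreadSafeDB('demo.json')
    with ThreadPoolExecutor(max_workers=5) as pool:
        futures = [pool.submit(db.insert, {'court': n}) for n in range(20)]
        for f in futures:
            f.result()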
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index d9b925d..684d9d7 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -134,7 +134,7 @@
         script = order.find_all('a')[0].get_attribute_list('onclick')[0]
         self.driver.execute_script(script)
 
-        sleep(2)
+        sleep(0.7)
 
         obj = self.driver.find_element(By.TAG_NAME, 'object')
         pdf_url = str(obj.get_attribute('data'))
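Cutting the fixed wait from 2 s to 0.7 s speeds up each order but races the
page: if the <object> tag is not injected within 0.7 s, the following
find_element fails. Should that bite, an explicit wait is the usual
alternative; a sketch assuming the same driver and tag (wait_for_pdf_object
is a hypothetical helper, not part of the patch):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def wait_for_pdf_object(driver, timeout=10):
        # Returns as soon as the <object> viewer exists, instead of
        # sleeping a fixed interval that is either too long or too short.
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, 'object'))
        )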
diff --git a/scrape_ecourtindia_v6/modules/scraper_orders.py b/scrape_ecourtindia_v6/modules/scraper_orders.py
index 78594e8..a9fe7be 100644
--- a/scrape_ecourtindia_v6/modules/scraper_orders.py
+++ b/scrape_ecourtindia_v6/modules/scraper_orders.py
@@ -1,17 +1,25 @@
 from time import sleep
+import tempfile
+import uuid
+import os
+from urllib import request
+
+from bs4 import BeautifulSoup
+
+import cv2
+import pytesseract
+
 
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.select import Select
 
-from tinydb import TinyDB
-
 from .scraper import Scraper
 
 class ScraperOrders(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index')
+    def __init__(self, db, config):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index', headless=True)
 
-        self.db = TinyDB('db.json')
+        self.db = db
         self.config = config
 
     def goto_courtnumber(self):
@@ -27,3 +35,69 @@
         print(f'COURT NUMBERS: {court_numbers}')
 
         return court_numbers
+
+    def submit_search(self):
+        captcha_incomplete = True
+        while captcha_incomplete:
+            img = self.driver.find_element(By.ID, 'captcha_image')
+            temp = tempfile.NamedTemporaryFile(suffix='.png')
+            img.screenshot(temp.name)
+
+            img = cv2.imread(temp.name)
+            text = pytesseract.image_to_string(img).strip()
+
+            element = self.driver.find_element(By.ID, 'order_no_captcha_code')
+            element.send_keys(text)
+
+            self.driver.execute_script('submitCourtNumber()')
+            sleep(3)
+
+            if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed():
+                self.close_modal()  # OCR failed; clear the field and retry
+                element.clear()
+            else:
+                captcha_incomplete = False
+
+    def parse_orders_table(self):
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except Exception:
+            return
+
+        rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
+        self.rows = []
+        i = 6
+        while i < len(rows):
+            self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ])
+            i += 5  # five <td> cells per order row
+
+    def handle_orders(self):
+        for row in self.rows:
+            order = row[0]
+
+            script = order.find_all('a')[0].get_attribute_list('onclick')[0]
+            self.driver.execute_script(script)
+
+            sleep(0.7)
+            obj = self.driver.find_elements(By.TAG_NAME, 'object')[-1]  # newest viewer is last in the DOM
+            pdf_url = str(obj.get_attribute('data'))
+
+            while True:  # pick an unused random filename
+                filename = f"pdf/{uuid.uuid4().hex}.pdf"
+                if not os.path.exists(filename):
+                    break
+
+            cookies = "; ".join([f"{c['name']}={c['value']}" for c in self.driver.get_cookies()])
+            r = request.Request(pdf_url)
+            r.add_header("Cookie", cookies)
+
+            try:
+                with request.urlopen(r) as response, open(filename, "wb") as file:
+                    file.write(response.read())
+            except Exception:
+                print(f'UNABLE TO FETCH PDF: {pdf_url}')
+
+            record = { 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
+            self.db.insert(record)
+
+            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
-- 
rgit 0.1.5
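For reference, handle_orders() can fetch the PDFs only because it forwards
the Selenium session's cookies onto the plain urllib request. The same idea
in isolation (download_with_session_cookies is a hypothetical helper; driver,
url, and filename are supplied by the caller):

    from urllib import request

    def download_with_session_cookies(driver, url, filename):
        # Reuse the browser's session so the server serves the PDF to
        # this request as if it came from the logged-in page.
        cookies = '; '.join(f"{c['name']}={c['value']}" for c in driver.get_cookies())
        req = request.Request(url)
        req.add_header('Cookie', cookies)
        with request.urlopen(req) as response, open(filename, 'wb') as fh:
            fh.write(response.read())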