From 0f188ea1e638e6abddb03d49b9209c703081b2fe Mon Sep 17 00:00:00 2001
From: Raghuram Subramani <raghus2247@gmail.com>
Date: Mon, 31 Mar 2025 14:30:38 +0530
Subject: [PATCH] update

---
 flake.nix                                            |  50 ++++++++++++++++++++++-------------
 scrape_ecourtindia_v6/.gitignore                     |   6 ++++--
 scrape_ecourtindia_v6/scrape_case_status.py          | 142 ++++++++++++++++++++++++----------------
 scrape_ecourtindia_v6/scrape_case_status_states.py   |  70 ++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/translate_to_english.py        |  42 ++++++++++++++++++
 test/.gitignore                                      |   2 ++
 test/transcribe.py                                   |  14 ++++++
 scrape_ecourtindia_v6/modules/scraper.py             |  13 +++++--
 scrape_ecourtindia_v6/modules/scraper_case_status.py |  60 ++++++++++-------------
 scrape_ecourtindia_v6/results/scraping_results.csv   |   1 +
 10 files changed, 259 insertions(+), 141 deletions(-)

diff --git a/flake.nix b/flake.nix
index 807fa45..93bca92 100644
--- a/flake.nix
+++ b/flake.nix
@@ -1,28 +1,34 @@
 {
   inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
   outputs = { self, nixpkgs, ... }:
     let
-      pkgs = import nixpkgs { system = "x86_64-linux"; config.allowUnfree = true; };
-    in {
-      devShells.x86_64-linux.default = pkgs.mkShell {
-        buildInputs = with pkgs; [
-          (python3.withPackages (p: [
-            p.selenium
-            p.opencv-python
-            p.pytesseract
-            p.beautifulsoup4
-            p.tinydb
-            p.fastapi
-            p.uvicorn
-            p.jinja2
-          ]))
-          pyright
-
-          firefox
-          geckodriver
-
-          tesseract
-        ];
-      };
+      system = "x86_64-linux";
+      pkgs = import nixpkgs { inherit system; config.allowUnfree = true; };
+    in {
+      devShells.${system}.default = pkgs.mkShell {
+        buildInputs = with pkgs; [
+          (python3.withPackages (p: [
+            p.selenium
+            p.opencv-python
+            p.pytesseract
+            p.beautifulsoup4
+            p.tinydb
+            p.fastapi
+            p.uvicorn
+            p.jinja2
+
+            # p.pdf2image
+            # p.openai-whisper
+            # p.torch-bin
+          ]))
+
+          pyright
+
+          firefox
+          geckodriver
+
+          tesseract
+        ];
       };
+  };
 }
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index f32422f..1aed0d4 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -1,6 +1,8 @@
-courts.csv
+*.csv
 csv/*
 named_pdf/*
 pdf/*
 html/*
-orders.json
+bak/
+translated/*
+*.json
diff --git a/scrape_ecourtindia_v6/scrape_case_status.py b/scrape_ecourtindia_v6/scrape_case_status.py
index 2b543ba..a8891fd 100644
--- a/scrape_ecourtindia_v6/scrape_case_status.py
+++ b/scrape_ecourtindia_v6/scrape_case_status.py
@@ -1,89 +1,67 @@
-import csv
+from time import sleep
 from modules.scraper_case_status import ScraperCaseStatus
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
-
-SCRAPE_ESTABLISHMENTS = True
-
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
-        self.lock = threading.Lock()
-
-    def writerow(self, row):
-        with self.lock:
-            self.writer.writerow(row)
-
-    def close(self):
-        self.file.close()
-
-def scrape_state_thread(state, config, csv_writer):
-    scraper = ScraperCaseStatus(config)
-    scraper.close_modal()
-    try:
-        scraper.select('sess_state_code', state)
-        for district in scraper.scrape_districts():
-            scraper.select('sess_dist_code', district)
-            for cmplx in scraper.scrape_complexes():
-                scraper.select('court_complex_code', cmplx)
-                if SCRAPE_ESTABLISHMENTS:
-                    establishments = []
-                    for establishment in scraper.scrape_establishments():
-                        establishments.append(establishment)
-
-                    csv_writer.writerow([ state, district, cmplx ] + establishments)
-                else:
-                    csv_writer.writerow([ state, district, cmplx ])
-    except Exception as e:
-        print(f"Error scraping {state}: {e}")
-    finally:
-        scraper.driver.quit()
-
-def scrape_courts():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
-    csv_writer.writerow(['State', 'District', 'Complex'])
-
-    states = m.scrape_states()
-    m.driver.close()
-
-    with ThreadPoolExecutor(max_workers=5) as executor:
-        futures = [
-            executor.submit(scrape_state_thread, state, config, csv_writer)
-            for state in states
-        ]
-
-        for future in as_completed(futures):
-            try:
-                future.result()
-            except Exception as e:
-                print(f"A thread encountered an error: {e}")
-
-    csv_writer.close()
+from tinydb import TinyDB

-def scrape_orders():
-    config = {}
+db = TinyDB('db.json')

-    m = ScraperCaseStatus(config)
-    m.close_modal()
+scraper = ScraperCaseStatus()

-    config['state'] = input('Select a state: ')
-    config['district'] = input('Select a district: ')
-    config['court_complex'] = input('Select a court complex: ')
-    config['court_establishment'] = input('Select a court establishment: ')
-    config['act'] = input('Select an act: ')
+state = 'Karnataka'
+act = 'Juvenile Justice (Care and Protection of Children) Act, 2015'

-    m.select_court()
-    m.goto_acts()
-    m.select_act()
-    m.handle_table()
+scraper.close_modal()
+scraper.select('sess_state_code', state)
+sleep(1)

-    m.driver.close()
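+# The nested loops below mirror the dropdown hierarchy on the eCourts
+# case-status page: State -> District -> Court Complex -> Establishment.
+# Each select() is wrapped in a retry loop because the page appears to
+# repopulate the next dropdown asynchronously and can throw a modal or
+# alert over the form at any point, so a failed selection is retried.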
+for district in scraper.scrape_districts():
+    print(f'SELECTING DISTRICT {district}')
+    while True:
+        try:
+            scraper.close_modal()
+            scraper.select('sess_dist_code', district)
+            break
+        except:
+            pass
+    sleep(1)
+
+    for cmplx in scraper.scrape_complexes():
+        sleep(1)
+        print(f'SELECTING COMPLEX {cmplx}')
+        while True:
+            try:
+                scraper.close_modal()
+                scraper.select('court_complex_code', cmplx)
+                break
+            except:
+                pass
+        try:
+            scraper.driver.switch_to.alert.accept()
+            scraper.close_modal()
+        except:
+            pass
+
+        for establishment in scraper.scrape_establishments():
+            sleep(1)
+            print(f'SELECTING ESTABLISHMENT {establishment}')
+            while True:
+                try:
+                    scraper.close_modal()
+                    scraper.select('court_est_code', establishment)
+                    break
+                except Exception as e:
+                    print("EXCEPTION HANDLED:")
+                    print(e)
+
+            sleep(1)
+            scraper.close_modal()
+
+            sleep(1)
+            scraper.goto_acts()
+            try:
+                scraper.select_act(act)
+                scraper.handle_table(db)
+            except Exception as e:
+                print("EXCEPTION HANDLED:")
+                print(e)

-if __name__ == '__main__':
-    scrape_courts()
+scraper.driver.close()
diff --git a/scrape_ecourtindia_v6/scrape_case_status_states.py b/scrape_ecourtindia_v6/scrape_case_status_states.py
new file mode 100644
index 0000000..e75af84
--- /dev/null
+++ b/scrape_ecourtindia_v6/scrape_case_status_states.py
@@ -1,0 +1,70 @@
+import csv
+from modules.scraper_case_status import ScraperCaseStatus
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+SCRAPE_ESTABLISHMENTS = True
+
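+# csv.writer instances are not thread-safe, so all scraper threads share
+# a single writer and serialise their writes through a lock; writerow()
+# holds the lock only long enough to emit one row.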
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+
+    def close(self):
+        self.file.close()
+
+def scrape_state_thread(state, config, csv_writer):
+    scraper = ScraperCaseStatus()  # ScraperCaseStatus no longer takes a config dict
+    scraper.close_modal()
+    try:
+        scraper.select('sess_state_code', state)
+        for district in scraper.scrape_districts():
+            scraper.select('sess_dist_code', district)
+            for cmplx in scraper.scrape_complexes():
+                scraper.select('court_complex_code', cmplx)
+                if SCRAPE_ESTABLISHMENTS:
+                    establishments = []
+                    for establishment in scraper.scrape_establishments():
+                        establishments.append(establishment)
+
+                    csv_writer.writerow([ state, district, cmplx ] + establishments)
+                else:
+                    csv_writer.writerow([ state, district, cmplx ])
+    except Exception as e:
+        print(f"Error scraping {state}: {e}")
+    finally:
+        scraper.driver.quit()
+
+def scrape_courts():
+    config = {}
+
+    m = ScraperCaseStatus()
+    m.close_modal()
+
+    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
+    csv_writer.writerow(['State', 'District', 'Complex'])
+
+    states = m.scrape_states()
+    m.driver.close()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_state_thread, state, config, csv_writer)
+            for state in states
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+if __name__ == '__main__':
+    scrape_courts()
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
new file mode 100644
index 0000000..485a4b8
--- /dev/null
+++ b/scrape_ecourtindia_v6/translate_to_english.py
@@ -1,0 +1,42 @@
+from tempfile import TemporaryDirectory
+
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+
+from tinydb import TinyDB
+
+language = 'hin'
+
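+# Note: despite the filename, to_english() performs OCR rather than
+# translation. Each PDF page is rasterised to a 500-DPI JPEG with
+# pdf2image and fed to Tesseract using the Hindi ('hin') traineddata,
+# so the output is Hindi text; translating it to English would be a
+# separate downstream step.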
+def to_english(input_file, output_file):
+    image_file_list = []
+
+    with TemporaryDirectory() as tempdir:
+        pdf_pages = convert_from_path(input_file, 500)
+
+        for page_enumeration, page in enumerate(pdf_pages, start=1):
+            filename = f"{tempdir}/page_{page_enumeration}.jpg"
+            page.save(filename, "JPEG")
+            image_file_list.append(filename)
+
+        with open(output_file, "a") as h:
+            for image_file in image_file_list:
+                text = str(pytesseract.image_to_string(Image.open(image_file), lang=language))
+
+                # In many PDFs, a word that does not fit at the end of a
+                # line is hyphenated and continued on the next line,
+                # e.g. "GeeksF-\norGeeks". Replacing every '-\n' with ''
+                # joins those words back together.
+                text = text.replace("-\n", "")
+
+                h.write(text)
+
+db = TinyDB('orders.json')
+entries = db.all()
+
+for entry in entries:
+    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..818a333
--- /dev/null
+++ b/test/.gitignore
@@ -1,0 +1,2 @@
+*.txt
+*.mp3
diff --git a/test/transcribe.py b/test/transcribe.py
new file mode 100644
index 0000000..c64f425
--- /dev/null
+++ b/test/transcribe.py
@@ -1,0 +1,14 @@
+import os
+import whisper
+
+def transcribe_audio(audio_file_path, model_name):
+    model = whisper.load_model(model_name)
+    result = model.transcribe(audio_file_path)
+    text_file_path = os.path.splitext(audio_file_path)[0] + ".txt"
+    with open(text_file_path, "w") as text_file:
+        text_file.write(result['text'])
+
+audio_file_path = 'test.mp3'
+
+if audio_file_path is not None:
+    transcribe_audio(audio_file_path, model_name='medium')
diff --git a/scrape_ecourtindia_v6/modules/scraper.py b/scrape_ecourtindia_v6/modules/scraper.py
index 4616763..140302e 100644
--- a/scrape_ecourtindia_v6/modules/scraper.py
+++ b/scrape_ecourtindia_v6/modules/scraper.py
@@ -20,8 +20,14 @@
         sleep(1)

     def select(self, i_d, value):
-        sleep(1)
-        element = self.driver.find_element(By.ID, i_d)
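+        # Poll for the <select> element instead of sleeping a fixed
+        # second: it seems to be re-created whenever an earlier dropdown
+        # changes, so retry every 200 ms until find_element() succeeds.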
+        while True:
+            try:
+                element = self.driver.find_element(By.ID, i_d)
+                break
+            except:
+                sleep(0.2)
+                pass
+
         select = Select(element)
         select.select_by_visible_text(value)
         sleep(1)
@@ -51,6 +57,9 @@
         print(f'COMPLEXES: {complexes}')

         return complexes
+
+    def establishments_visible(self):
+        return self.driver.find_element(By.ID, 'court_est_code').is_displayed()

     def scrape_establishments(self):
         element = self.driver.find_element(By.ID, 'court_est_code')
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index 684d9d7..b4a9ec3 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -5,7 +5,6 @@
 from urllib import request

 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.select import Select

 from bs4 import BeautifulSoup

@@ -13,45 +12,30 @@
 import pytesseract
 import tempfile

-from tinydb import TinyDB
-
 from .scraper import Scraper

 class ScraperCaseStatus(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
-
-        self.db = TinyDB('db.json')
-        self.config = config
-
-    def select_act(self):
-        self.select('actcode', self.config['act'])
+    def __init__(self):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index', headless=False)
+
+    def select_act(self, act):
+        self.select('actcode', act)

         sleep(1)

         # Disposed only
         self.driver.find_element(By.ID, 'radDAct').click()

         self.submit_search()

-    def select_court(self):
-        sleep(2)
+    def goto_acts(self):
         while True:
-            self.select('sess_state_code', self.config['state'])
-            self.select('sess_dist_code', self.config['district'])
-            self.select('court_complex_code', self.config['court_complex'])
-
-            sleep(2)
-            modal_is_open = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed()
-            if modal_is_open:
+            try:
                 self.close_modal()
-                continue
-
-            break
-
-        self.select('court_est_code', self.config['court_establishment'])
+                element = self.driver.find_element(By.ID, 'act-tabMenu')
+                element.click()
+                break
+            except:
+                pass

-    def goto_acts(self):
-        element = self.driver.find_element(By.ID, 'act-tabMenu')
-        element.click()
         sleep(1)

     def submit_search(self):
@@ -76,9 +60,13 @@
                 element.clear()
             else:
                 captcha_incomplete = False
+
+    def handle_table(self, db):
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except:
+            return

-    def handle_table(self):
-        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
         self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
         self.views = []
         i = 5
@@ -109,7 +97,7 @@

             self.parse_orders_table()

-            self.db.insert(self.current_view)
+            db.insert(self.current_view)
             print(f'INSERTED: {self.current_view}')
             self.driver.find_element(By.ID, 'main_back_act').click()
             i += 4
@@ -134,7 +122,7 @@
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)

-            sleep(0.7)
+            sleep(1)

             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
@@ -153,4 +141,10 @@
         except:
             print(f'UNABLE TO FETCH PDF: {pdf_url}')

-        self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+        sleep(1)
+        while True:
+            try:
+                self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+                break
+            except:
+                pass
diff --git a/scrape_ecourtindia_v6/results/scraping_results.csv b/scrape_ecourtindia_v6/results/scraping_results.csv
new file mode 100644
index 0000000..35dff1a
--- /dev/null
+++ b/scrape_ecourtindia_v6/results/scraping_results.csv
@@ -1,0 +1,1 @@
+State,District,Complex,Establishment,Records
-- 
rgit 0.1.5