From 0f188ea1e638e6abddb03d49b9209c703081b2fe Mon Sep 17 00:00:00 2001
From: Raghuram Subramani <raghus2247@gmail.com>
Date: Mon, 31 Mar 2025 14:30:38 +0530
Subject: [PATCH] Rework case status scraper; add courts scraper, OCR and transcription scripts

---
 flake.nix                                            |  51 +++++++++++++++++++++++++++++++-------------------
 scrape_ecourtindia_v6/.gitignore                     |   6 ++++--
 scrape_ecourtindia_v6/scrape_case_status.py          | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 scrape_ecourtindia_v6/scrape_case_status_states.py   |  70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/translate_to_english.py        |  40 ++++++++++++++++++++++++++++++++++++++++
 test/.gitignore                                      |   2 ++
 test/transcribe.py                                   |  15 +++++++++++++++
 scrape_ecourtindia_v6/modules/scraper.py             |  13 +++++++++++--
 scrape_ecourtindia_v6/modules/scraper_case_status.py |  63 ++++++++++++++++++++++++++++++++-----------------------------
 scrape_ecourtindia_v6/results/scraping_results.csv   |   1 +
 10 files changed, 264 insertions(+), 141 deletions(-)

diff --git a/flake.nix b/flake.nix
index 807fa45..93bca92 100644
--- a/flake.nix
+++ b/flake.nix
@@ -1,28 +1,35 @@
 {
   inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
 
   outputs = { self, nixpkgs, ... }: let
-      pkgs = import nixpkgs { system = "x86_64-linux"; config.allowUnfree = true; };
-    in {
-      devShells.x86_64-linux.default = pkgs.mkShell {
-        buildInputs = with pkgs; [
-          (python3.withPackages (p: [
-            p.selenium
-            p.opencv-python
-            p.pytesseract
-            p.beautifulsoup4
-            p.tinydb
-            p.fastapi
-            p.uvicorn
-            p.jinja2
-          ]))
-          pyright
-
-          firefox
-          geckodriver
-
-          tesseract
-        ];
-      };
+    system = "x86_64-linux";
+    pkgs = import nixpkgs { inherit system; config.allowUnfree = true; };
+  in {
+    devShells.${system}.default = pkgs.mkShell {
+      buildInputs = with pkgs; [
+        (python3.withPackages (p: [
+          p.selenium
+          p.opencv-python
+          p.pytesseract
+          p.beautifulsoup4
+          p.tinydb
+          p.fastapi
+          p.uvicorn
+          p.jinja2
+
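+          # Optional extras for translate_to_english.py and test/transcribe.py: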
+          # p.pdf2image
+          # p.openai-whisper
+          # p.torch-bin
+        ]))
+
+        pyright
+
+        firefox
+        geckodriver
+
+        tesseract
+      ];
     };
+  };
 }
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index f32422f..1aed0d4 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -1,6 +1,8 @@
-courts.csv
+*.csv
 csv/*
 named_pdf/*
 pdf/*
 html/*
-orders.json
+bak/
+translated/*
+*.json
diff --git a/scrape_ecourtindia_v6/scrape_case_status.py b/scrape_ecourtindia_v6/scrape_case_status.py
index 2b543ba..a8891fd 100644
--- a/scrape_ecourtindia_v6/scrape_case_status.py
+++ b/scrape_ecourtindia_v6/scrape_case_status.py
@@ -1,89 +1,69 @@
-import csv
+from time import sleep
 from modules.scraper_case_status import ScraperCaseStatus
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
-
-SCRAPE_ESTABLISHMENTS = True
-
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
-        self.lock = threading.Lock()
-
-    def writerow(self, row):
-        with self.lock:
-            self.writer.writerow(row)
-
-    def close(self):
-        self.file.close()
-
-def scrape_state_thread(state, config, csv_writer):
-    scraper = ScraperCaseStatus(config)
-    scraper.close_modal()
-    try:
-        scraper.select('sess_state_code', state)
-        for district in scraper.scrape_districts():
-            scraper.select('sess_dist_code', district)
-            for cmplx in scraper.scrape_complexes():
-                scraper.select('court_complex_code', cmplx)
-                if SCRAPE_ESTABLISHMENTS:
-                    establishments = []
-                    for establishment in scraper.scrape_establishments():
-                        establishments.append(establishment)
-
-                    csv_writer.writerow([ state, district, cmplx ] + establishments)
-                else:
-                    csv_writer.writerow([ state, district, cmplx ])
-    except Exception as e:
-        print(f"Error scraping {state}: {e}")
-    finally:
-        scraper.driver.quit()
-
-def scrape_courts():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
-    csv_writer.writerow(['State', 'District', 'Complex'])
-
-    states = m.scrape_states()
-    m.driver.close()
-
-    with ThreadPoolExecutor(max_workers=5) as executor:
-        futures = [
-            executor.submit(scrape_state_thread, state, config, csv_writer) 
-            for state in states
-        ]
-
-        for future in as_completed(futures):
-            try:
-                future.result()
-            except Exception as e:
-                print(f"A thread encountered an error: {e}")
-
-    csv_writer.close()
+from tinydb import TinyDB
 
-def scrape_orders():
-    config = {}
+db = TinyDB('db.json')
 
-    m = ScraperCaseStatus(config)
-    m.close_modal()
+scraper = ScraperCaseStatus()
 
-    config['state'] = input('Select a state: ')
-    config['district'] = input('Select a district: ')
-    config['court_complex'] = input('Select a court complex: ')
-    config['court_establishment'] = input('Select a court establishment: ')
-    config['act'] = input('Select an act: ')
+state = 'Karnataka'
+act = 'Juvenile Justice (Care and Protection of Children) Act, 2015'
 
-    m.select_court()
-    m.goto_acts()
-    m.select_act()
-    m.handle_table()
+scraper.close_modal()
+scraper.select('sess_state_code', state)
+sleep(1)
 
-    m.driver.close()
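+# Walk every district -> complex -> establishment for the chosen state,
+# retrying each selection until it succeeds despite intermittent modals.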
+for district in scraper.scrape_districts():
+    print(f'SELECTING DISTRICT {district}')
+    while True:
+        try:
+            scraper.close_modal()
+            scraper.select('sess_dist_code', district)
+            break
+        except Exception:
+            pass
+    sleep(1)
+
+    for cmplx in scraper.scrape_complexes():
+        sleep(1)
+        print(f'SELECTING COMPLEX {cmplx}')
+        while True:
+            try:
+                scraper.close_modal()
+                scraper.select('court_complex_code', cmplx)
+                break
+            except Exception:
+                pass
+        try:
+            scraper.driver.switch_to.alert.accept()
+            scraper.close_modal()
+        except Exception:
+            pass
+
+        for establishment in scraper.scrape_establishments():
+            sleep(1)
+            print(f'SELECTING ESTABLISHMENT {establishment}')
+            while True:
+                try:
+                    scraper.close_modal()
+                    scraper.select('court_est_code', establishment)
+                    break
+                except Exception as e:
+                    print("EXCEPTION HANDLED:")
+                    print(e)
+
+            sleep(1)
+            scraper.close_modal()
+
+            sleep(1)
+            scraper.goto_acts()
+            try:
+                scraper.select_act(act)
+                scraper.handle_table(db)
+            except Exception as e:
+                    print("EXCEPTION HANDLED:")
+                    print(e)
 
-if __name__ == '__main__':
-    scrape_courts()
+scraper.driver.close()
diff --git a/scrape_ecourtindia_v6/scrape_case_status_states.py b/scrape_ecourtindia_v6/scrape_case_status_states.py
new file mode 100644
index 0000000..e75af84 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/scrape_case_status_states.py
@@ -0,0 +1,70 @@
+import csv
+from modules.scraper_case_status import ScraperCaseStatus
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+SCRAPE_ESTABLISHMENTS = True
+
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+
+    def close(self):
+        self.file.close()
+
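+# Each worker thread drives its own browser: WebDriver instances are not
+# thread-safe, so only the CSV writer is shared (guarded by a lock).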
+def scrape_state_thread(state, csv_writer):
+    scraper = ScraperCaseStatus()
+    scraper.close_modal()
+    try:
+        scraper.select('sess_state_code', state)
+        for district in scraper.scrape_districts():
+            scraper.select('sess_dist_code', district)
+            for cmplx in scraper.scrape_complexes():
+                scraper.select('court_complex_code', cmplx)
+                if SCRAPE_ESTABLISHMENTS:
+                    establishments = []
+                    for establishment in scraper.scrape_establishments():
+                        establishments.append(establishment)
+
+                    csv_writer.writerow([ state, district, cmplx ] + establishments)
+                else:
+                    csv_writer.writerow([ state, district, cmplx ])
+    except Exception as e:
+        print(f"Error scraping {state}: {e}")
+    finally:
+        scraper.driver.quit()
+
+def scrape_courts():
+    m = ScraperCaseStatus()
+    m.close_modal()
+
+    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
+    csv_writer.writerow(['State', 'District', 'Complex'])
+
+    states = m.scrape_states()
+    m.driver.close()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_state_thread, state, csv_writer)
+            for state in states
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+if __name__ == '__main__':
+    scrape_courts()
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
new file mode 100644
index 0000000..485a4b8 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/translate_to_english.py
@@ -0,0 +1,40 @@
+from tempfile import TemporaryDirectory
+
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+
+from tinydb import TinyDB
+
+language = 'hin'
+
+def to_english(input_file, output_file):
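+    # Render each PDF page to an image, OCR it with Tesseract's Hindi
+    # traineddata, and append the recognised text to output_file.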
+    image_file_list = []
+
+    with TemporaryDirectory() as tempdir:
+        pdf_pages = convert_from_path(input_file, 500)
+
+        for page_enumeration, page in enumerate(pdf_pages, start=1):
+            filename = f"{tempdir}/page_{page_enumeration}.jpg"
+            page.save(filename, "JPEG")
+            image_file_list.append(filename)
+
+        with open(output_file, "a") as h:
+            for image_file in image_file_list:
+                text = pytesseract.image_to_string(Image.open(image_file), lang=language)
+
+                # OCR output hyphenates words broken across line endings
+                # (e.g. "exam-\nple"), so rejoin them by deleting every
+                # '-\n' sequence.
+                text = text.replace("-\n", "")
+
+                h.write(text)
+
+db = TinyDB('orders.json')
+entries = db.all()
+
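+# Stored filenames look like 'pdf/<name>.pdf'; strip the prefix and extension.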
+for entry in entries:
+    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..818a333 100644
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1,2 @@
+*.txt
+*.mp3
diff --git a/test/transcribe.py b/test/transcribe.py
new file mode 100644
index 0000000..c64f425 100644
--- /dev/null
+++ b/test/transcribe.py
@@ -0,0 +1,15 @@
+import os
+import whisper
+
+def transcribe_audio(audio_file_path, model_name):
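+    # Load the Whisper model, transcribe, and write the text next to the audio file.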
+    model = whisper.load_model(model_name)
+    result = model.transcribe(audio_file_path)
+    text_file_path = os.path.splitext(audio_file_path)[0] + ".txt"
+    with open(text_file_path, "w") as text_file:
+        text_file.write(result['text'])
+
+audio_file_path = 'test.mp3'
+
+if __name__ == '__main__':
+    transcribe_audio(audio_file_path, model_name='medium')
diff --git a/scrape_ecourtindia_v6/modules/scraper.py b/scrape_ecourtindia_v6/modules/scraper.py
index 4616763..140302e 100644
--- a/scrape_ecourtindia_v6/modules/scraper.py
+++ b/scrape_ecourtindia_v6/modules/scraper.py
@@ -20,8 +20,14 @@
         sleep(1)
 
     def select(self, i_d, value):
-        sleep(1)
-        element = self.driver.find_element(By.ID, i_d)
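+        # Poll until the element is present in the DOM, checking every 0.2s.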
+        while True:
+            try:
+                element = self.driver.find_element(By.ID, i_d)
+                break
+            except Exception:
+                sleep(0.2)
+
         select = Select(element)
         select.select_by_visible_text(value)
         sleep(1)
@@ -51,6 +57,9 @@
         print(f'COMPLEXES: {complexes}')
 
         return complexes
+
+    def establishments_visible(self):
+        return self.driver.find_element(By.ID, 'court_est_code').is_displayed()
 
     def scrape_establishments(self):
         element = self.driver.find_element(By.ID, 'court_est_code')
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index 684d9d7..b4a9ec3 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -5,7 +5,6 @@
 from urllib import request
 
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.select import Select
 
 from bs4 import BeautifulSoup
 
@@ -13,45 +12,31 @@
 import pytesseract
 import tempfile
 
-from tinydb import TinyDB
-
 from .scraper import Scraper
 
 class ScraperCaseStatus(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
-
-        self.db = TinyDB('db.json')
-        self.config = config
-
-    def select_act(self):
-        self.select('actcode', self.config['act'])
+    def __init__(self):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index', headless=False)
+
+    def select_act(self, act):
+        self.select('actcode', act)
         sleep(1)
 
         # Disposed only
         self.driver.find_element(By.ID, 'radDAct').click()
         self.submit_search()
 
-    def select_court(self):
-        sleep(2)
+    def goto_acts(self):
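+        # Keep dismissing the error modal until the Acts tab can be clicked.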
         while True:
-            self.select('sess_state_code', self.config['state'])
-            self.select('sess_dist_code', self.config['district'])
-            self.select('court_complex_code', self.config['court_complex'])
-
-            sleep(2)
-            modal_is_open = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed()
-            if modal_is_open:
+            try:
                 self.close_modal()
-                continue
-
-            break
-
-        self.select('court_est_code', self.config['court_establishment'])
+                element = self.driver.find_element(By.ID, 'act-tabMenu')
+                element.click()
+                break
+            except Exception:
+                pass
 
-    def goto_acts(self):
-        element = self.driver.find_element(By.ID, 'act-tabMenu')
-        element.click()
         sleep(1)
 
     def submit_search(self):
@@ -76,9 +61,14 @@
                 element.clear()
             else:
                 captcha_incomplete = False
+
+    def handle_table(self, db):
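+        # Bail out quietly if the search produced no results table.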
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except Exception:
+            return
 
-    def handle_table(self):
-        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
         self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
         self.views = []
         i = 5
@@ -109,7 +99,7 @@
 
             self.parse_orders_table()
 
-            self.db.insert(self.current_view)
+            db.insert(self.current_view)
             print(f'INSERTED: {self.current_view}')
             self.driver.find_element(By.ID, 'main_back_act').click()
             i += 4
@@ -134,7 +124,7 @@
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)
 
-            sleep(0.7)
+            sleep(1)
             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
 
@@ -153,4 +143,11 @@
             except:
                 print(f'UNABLE TO FETCH PDF: {pdf_url}')
 
-            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+            sleep(1)
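+            # Retry until the orders modal's close button accepts the click.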
+            while True:
+                try:
+                    self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+                    break
+                except Exception:
+                    pass
diff --git a/scrape_ecourtindia_v6/results/scraping_results.csv b/scrape_ecourtindia_v6/results/scraping_results.csv
new file mode 100644
index 0000000..35dff1a 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/results/scraping_results.csv
@@ -0,0 +1 @@
+State,District,Complex,Establishment,Records
--
rgit 0.1.5