github.com/compromyse/enfold.git

author    Raghuram Subramani <raghus2247@gmail.com> 2025-03-31 14:30:38 +05:30
committer Raghuram Subramani <raghus2247@gmail.com> 2025-03-31 14:30:57 +05:30
commit    0f188ea1e638e6abddb03d49b9209c703081b2fe [patch]
tree      cfe69bb82158fccf9eb4d5737d0c9c1603c5e1f1
parent    97d1df0cd10f9f4adc1991cc8067cc8f1d3978cf
download  0f188ea1e638e6abddb03d49b9209c703081b2fe.tar.gz

update

Diff

 flake.nix                                            |  50 ++++++++++++++++++++++++++++----------------------
 scrape_ecourtindia_v6/.gitignore                     |   6 ++++--
 scrape_ecourtindia_v6/scrape_case_status.py          | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------------------------
 scrape_ecourtindia_v6/scrape_case_status_states.py   |  70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/translate_to_english.py        |  42 ++++++++++++++++++++++++++++++++++++++++++
 test/.gitignore                                      |   2 ++
 test/transcribe.py                                   |  14 ++++++++++++++
 scrape_ecourtindia_v6/modules/scraper.py             |  13 +++++++++++--
 scrape_ecourtindia_v6/modules/scraper_case_status.py |  60 +++++++++++++++++++++++++++++---------------------------------
 scrape_ecourtindia_v6/results/scraping_results.csv   |   1 +
 10 files changed, 259 insertions(+), 141 deletions(-)

diff --git a/flake.nix b/flake.nix
index 807fa45..93bca92 100644
--- a/flake.nix
+++ b/flake.nix
@@ -1,28 +1,34 @@
 {
   inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
 
   outputs = { self, nixpkgs, ... }: let
-      pkgs = import nixpkgs { system = "x86_64-linux"; config.allowUnfree = true; };
-    in {
-      devShells.x86_64-linux.default = pkgs.mkShell {
-        buildInputs = with pkgs; [
-          (python3.withPackages (p: [
-            p.selenium
-            p.opencv-python
-            p.pytesseract
-            p.beautifulsoup4
-            p.tinydb
-            p.fastapi
-            p.uvicorn
-            p.jinja2
-          ]))
-          pyright
-
-          firefox
-          geckodriver
-
-          tesseract
-        ];
-      };
+    system = "x86_64-linux";
+    pkgs = import nixpkgs { inherit system; config.allowUnfree = true; };
+  in {
+    devShells.${system}.default = pkgs.mkShell {
+      buildInputs = with pkgs; [
+        (python3.withPackages (p: [
+          p.selenium
+          p.opencv-python
+          p.pytesseract
+          p.beautifulsoup4
+          p.tinydb
+          p.fastapi
+          p.uvicorn
+          p.jinja2
+
+          # p.pdf2image
+          # p.openai-whisper
+          # p.torch-bin
+        ]))
+
+        pyright
+
+        firefox
+        geckodriver
+
+        tesseract
+      ];
+    };
   };
 }
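
With flakes enabled, `nix develop` enters the `devShells.x86_64-linux.default` shell declared above. Note that `p.pdf2image`, `p.openai-whisper`, and `p.torch-bin` are commented out, so the new `translate_to_english.py` and `test/transcribe.py` scripts below assume those Python packages are provided some other way.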
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index f32422f..1aed0d4 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -1,6 +1,8 @@
-courts.csv
+*.csv
 csv/*
 named_pdf/*
 pdf/*
 html/*
-orders.json
+bak/
+translated/*
+*.json
diff --git a/scrape_ecourtindia_v6/scrape_case_status.py b/scrape_ecourtindia_v6/scrape_case_status.py
index 2b543ba..a8891fd 100644
--- a/scrape_ecourtindia_v6/scrape_case_status.py
+++ b/scrape_ecourtindia_v6/scrape_case_status.py
@@ -1,89 +1,67 @@
-import csv
+from time import sleep
 from modules.scraper_case_status import ScraperCaseStatus
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
 
-SCRAPE_ESTABLISHMENTS = True
-
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
-        self.lock = threading.Lock()
-
-    def writerow(self, row):
-        with self.lock:
-            self.writer.writerow(row)
-
-    def close(self):
-        self.file.close()
-
-def scrape_state_thread(state, config, csv_writer):
-    scraper = ScraperCaseStatus(config)
-    scraper.close_modal()
-    try:
-        scraper.select('sess_state_code', state)
-        for district in scraper.scrape_districts():
-            scraper.select('sess_dist_code', district)
-            for cmplx in scraper.scrape_complexes():
-                scraper.select('court_complex_code', cmplx)
-                if SCRAPE_ESTABLISHMENTS:
-                    establishments = []
-                    for establishment in scraper.scrape_establishments():
-                        establishments.append(establishment)
-
-                    csv_writer.writerow([ state, district, cmplx ] + establishments)
-                else:
-                    csv_writer.writerow([ state, district, cmplx ])
-    except Exception as e:
-        print(f"Error scraping {state}: {e}")
-    finally:
-        scraper.driver.quit()
-
-def scrape_courts():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
-    csv_writer.writerow(['State', 'District', 'Complex'])
-
-    states = m.scrape_states()
-    m.driver.close()
-
-    with ThreadPoolExecutor(max_workers=5) as executor:
-        futures = [
-            executor.submit(scrape_state_thread, state, config, csv_writer)
-            for state in states
-        ]
-
-        for future in as_completed(futures):
-            try:
-                future.result()
-            except Exception as e:
-                print(f"A thread encountered an error: {e}")
-
-    csv_writer.close()
+from tinydb import TinyDB
 
-def scrape_orders():
-    config = {}
+db = TinyDB('db.json')
 
-    m = ScraperCaseStatus(config)
-    m.close_modal()
+scraper = ScraperCaseStatus()
 
-    config['state'] = input('Select a state: ')
-    config['district'] = input('Select a district: ')
-    config['court_complex'] = input('Select a court complex: ')
-    config['court_establishment'] = input('Select a court establishment: ')
-    config['act'] = input('Select an act: ')
+state = 'Karnataka'
+act = 'Juvenile Justice (Care and Protection of Children) Act, 2015'
 
-    m.select_court()
-    m.goto_acts()
-    m.select_act()
-    m.handle_table()
+scraper.close_modal()
+scraper.select('sess_state_code', state)
+sleep(1)
 
-    m.driver.close()
+for district in scraper.scrape_districts():
+    print(f'SELECTING DISTRICT {district}')
+    while True:
+        try:
+            scraper.close_modal()
+            scraper.select('sess_dist_code', district)
+            break
+        except:
+            pass
+    sleep(1)
+
+    for cmplx in scraper.scrape_complexes():
+        sleep(1)
+        print(f'SELECTING COMPLEX {cmplx}')
+        while True:
+            try:
+                scraper.close_modal()
+                scraper.select('court_complex_code', cmplx)
+                break
+            except:
+                pass
+        try:
+            scraper.driver.switch_to.alert.accept();
+            scraper.close_modal()
+        except:
+            pass
+
+        for establishment in scraper.scrape_establishments():
+            sleep(1)
+            print(f'SELECTING ESTABLISHMENT {establishment}')
+            while True:
+                try:
+                    scraper.close_modal()
+                    scraper.select('court_est_code', establishment)
+                    break
+                except Exception as e:
+                    print("EXCEPTION HANDLED:")
+                    print(e)
+
+            sleep(1)
+            scraper.close_modal()
+
+            sleep(1)
+            scraper.goto_acts()
+            try:
+                scraper.select_act(act)
+                scraper.handle_table(db)
+            except Exception as e:
+                    print("EXCEPTION HANDLED:")
+                    print(e)
-
-if __name__ == '__main__':
-    scrape_courts()
+scraper.driver.close()
diff --git a/scrape_ecourtindia_v6/scrape_case_status_states.py b/scrape_ecourtindia_v6/scrape_case_status_states.py
new file mode 100644
index 0000000..e75af84 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/scrape_case_status_states.py
@@ -0,0 +1,70 @@
+import csv
+from modules.scraper_case_status import ScraperCaseStatus
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+SCRAPE_ESTABLISHMENTS = True
+
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+
+    def close(self):
+        self.file.close()
+
+def scrape_state_thread(state, config, csv_writer):
+    scraper = ScraperCaseStatus(config)
+    scraper.close_modal()
+    try:
+        scraper.select('sess_state_code', state)
+        for district in scraper.scrape_districts():
+            scraper.select('sess_dist_code', district)
+            for cmplx in scraper.scrape_complexes():
+                scraper.select('court_complex_code', cmplx)
+                if SCRAPE_ESTABLISHMENTS:
+                    establishments = []
+                    for establishment in scraper.scrape_establishments():
+                        establishments.append(establishment)
+
+                    csv_writer.writerow([ state, district, cmplx ] + establishments)
+                else:
+                    csv_writer.writerow([ state, district, cmplx ])
+    except Exception as e:
+        print(f"Error scraping {state}: {e}")
+    finally:
+        scraper.driver.quit()
+
+def scrape_courts():
+    config = {}
+
+    m = ScraperCaseStatus(config)
+    m.close_modal()
+
+    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
+    csv_writer.writerow(['State', 'District', 'Complex'])
+
+    states = m.scrape_states()
+    m.driver.close()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_state_thread, state, config, csv_writer)
+            for state in states
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+if __name__ == '__main__':
+    scrape_courts()
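
The ThreadSafeCSVWriter above exists because a csv.writer wrapped around one shared file handle is not safe to use from several threads at once: each worker owns its own WebDriver, and the lock serializes only the shared output file. A minimal sketch of the same pattern in isolation (demo.csv and the rows are illustrative, not from the repository):

    import csv
    import threading
    from concurrent.futures import ThreadPoolExecutor

    lock = threading.Lock()

    with open('demo.csv', 'w', newline='') as f:  # illustrative output path
        writer = csv.writer(f)

        def emit(n):
            # Without the lock, rows written by concurrent workers could interleave.
            with lock:
                writer.writerow([n, n * n])

        with ThreadPoolExecutor(max_workers=5) as pool:
            list(pool.map(emit, range(20)))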
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
new file mode 100644
index 0000000..485a4b8 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/translate_to_english.py
@@ -0,0 +1,42 @@
+from tempfile import TemporaryDirectory
+
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+
+from tinydb import TinyDB
+
+language = 'hin'
+
+def to_english(input_file, output_file):
+    image_file_list = []
+
+    with TemporaryDirectory() as tempdir:
+        pdf_pages = convert_from_path(input_file, 500)
+
+        for page_enumeration, page in enumerate(pdf_pages, start=1):
+            filename = f"{tempdir}/page_{page_enumeration}.jpg"
+            page.save(filename, "JPEG")
+            image_file_list.append(filename)
+
+        with open(output_file, "a") as h:
+            for image_file in image_file_list:
+                text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
+
+                # In many PDFs, at line ending, if a word can't
+                # be written fully, a 'hyphen' is added.
+                # The rest of the word is written in the next line
+                # Eg: This is a sample text this word here GeeksF-
+                # orGeeks is half on first line, remaining on next.
+                # To remove this, we replace every '-\n' to ''.
+                text = text.replace("-\n", "")
+
+                breakpoint()
+
+                h.write(text)
+
+db = TinyDB('orders.json')
+entries = db.all()
+
+for entry in entries:
+    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
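
Despite its name, translate_to_english.py performs OCR rather than translation: convert_from_path rasterizes each PDF page at 500 DPI via poppler, pytesseract reads the images with the 'hin' traineddata, and the leftover breakpoint() call will pause execution on every page. A small pre-flight check, assuming both external tools are meant to be on the PATH:

    # Sketch: verify the OCR toolchain before a long batch run.
    import shutil
    import pytesseract

    # pdf2image shells out to poppler's pdftoppm.
    assert shutil.which('pdftoppm'), 'poppler not found on PATH'
    # lang='hin' needs the Hindi traineddata installed for tesseract.
    assert 'hin' in pytesseract.get_languages(config=''), 'hin traineddata missing'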
diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..818a333 100644
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1,2 @@
+*.txt
+*.mp3
diff --git a/test/transcribe.py b/test/transcribe.py
new file mode 100644
index 0000000..c64f425 100644
--- /dev/null
+++ b/test/transcribe.py
@@ -0,0 +1,14 @@
+import os
+import whisper
+
+def transcribe_audio(audio_file_path, model_path):
+    model = whisper.load_model(model_path)
+    result = model.transcribe(audio_file_path)
+    text_file_path = os.path.splitext(audio_file_path)[0] + ".txt"
+    with open(text_file_path, "w") as text_file:
+        text_file.write(result['text'])
+
+audio_file_path = 'test.mp3'
+
+if audio_file_path is not None:
+    transcribe_audio(audio_file_path, model_path='medium')
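
whisper.load_model('medium') downloads the checkpoint on first use, and model.transcribe returns a dict whose 'text' key holds the transcript; decoding also requires ffmpeg. A minimal sketch with explicit options (the language='hi' hint is an assumption here, the script itself relies on auto-detection):

    import whisper

    model = whisper.load_model('medium')  # cached under ~/.cache/whisper after the first run
    # fp16=False silences the half-precision warning on CPU-only machines;
    # language='hi' (hypothetical) skips automatic language detection.
    result = model.transcribe('test.mp3', fp16=False, language='hi')
    print(result['text'])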
diff --git a/scrape_ecourtindia_v6/modules/scraper.py b/scrape_ecourtindia_v6/modules/scraper.py
index 4616763..140302e 100644
--- a/scrape_ecourtindia_v6/modules/scraper.py
+++ b/scrape_ecourtindia_v6/modules/scraper.py
@@ -20,8 +20,14 @@
         sleep(1)
 
     def select(self, i_d, value):
+        sleep(1)
-        element = self.driver.find_element(By.ID, i_d)
+        while True:
+            try:
+                element = self.driver.find_element(By.ID, i_d)
+                break
+            except:
+                sleep(0.2)
+                pass
-
         select = Select(element)
         select.select_by_visible_text(value)
         sleep(1)
@@ -51,6 +57,9 @@
         print(f'COMPLEXES: {complexes}')
 
         return complexes
 
+    def establishments_visible(self):
+        return self.driver.find_element(By.ID, 'court_est_code').is_displayed()
+
     def scrape_establishments(self):
         element = self.driver.find_element(By.ID, 'court_est_code')
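
The while/try polling added to select() re-queries the DOM until the element appears; Selenium's explicit waits express the same idea with a bounded timeout. A sketch of the equivalent (the 10-second timeout is an assumption):

    # Sketch: the manual retry loop expressed with Selenium's explicit waits.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.select import Select
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def select_when_ready(driver, element_id, value, timeout=10):
        # Blocks until the <select> is in the DOM, or raises TimeoutException.
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, element_id))
        )
        Select(element).select_by_visible_text(value)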
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index 684d9d7..b4a9ec3 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -5,7 +5,6 @@
 from urllib import request
 
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.select import Select
 
 from bs4 import BeautifulSoup
 
@@ -13,45 +12,30 @@
 import pytesseract
 import tempfile
 
-from tinydb import TinyDB
-
 from .scraper import Scraper
 
 class ScraperCaseStatus(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
-
-        self.db = TinyDB('db.json')
-        self.config = config
-
-    def select_act(self):
-        self.select('actcode', self.config['act'])
+    def __init__(self):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index', headless=False)
 
+    def select_act(self, act):
+        self.select('actcode', act)
         sleep(1)
 
         # Disposed only
         self.driver.find_element(By.ID, 'radDAct').click()
         self.submit_search()
 
-    def select_court(self):
-        sleep(2)
+    def goto_acts(self):
+        while True:
-            self.select('sess_state_code', self.config['state'])
-            self.select('sess_dist_code', self.config['district'])
-            self.select('court_complex_code', self.config['court_complex'])
-
-            sleep(2)
-            modal_is_open = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed()
-            if modal_is_open:
+            try:
                 self.close_modal()
-                continue
-
-            break
-
-        self.select('court_est_code', self.config['court_establishment'])
+                element = self.driver.find_element(By.ID, 'act-tabMenu')
+                element.click()
+                break
+            except:
+                pass
 
-    def goto_acts(self):
-        element = self.driver.find_element(By.ID, 'act-tabMenu')
-        element.click()
         sleep(1)
 
     def submit_search(self):
@@ -76,9 +60,13 @@
                 element.clear()
             else:
                 captcha_incomplete = False
 
+    def handle_table(self, db):
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except:
+            return
+
-    def handle_table(self):
-        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
         self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
         self.views = []
         i = 5
@@ -109,7 +97,7 @@
 
             self.parse_orders_table()
 
-            self.db.insert(self.current_view)
+            db.insert(self.current_view)
             print(f'INSERTED: {self.current_view}')
             self.driver.find_element(By.ID, 'main_back_act').click()
             i += 4
@@ -134,7 +122,7 @@
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)
 
-            sleep(0.7)
+            sleep(1)
             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
 
@@ -153,4 +141,10 @@
             except:
                 print(f'UNABLE TO FETCH PDF: {pdf_url}')
 
-            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+            sleep(1)
+            while True:
+                try:
+                    self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+                    break
+                except:
+                    pass
diff --git a/scrape_ecourtindia_v6/results/scraping_results.csv b/scrape_ecourtindia_v6/results/scraping_results.csv
new file mode 100644
index 0000000..35dff1a 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/results/scraping_results.csv
@@ -0,0 +1 @@
+State,District,Complex,Establishment,Records