github.com/compromyse/enfold.git

author	Raghuram Subramani <raghus2247@gmail.com>	2025-03-24 17:11:41.0 +05:30:00
committer	Raghuram Subramani <raghus2247@gmail.com>	2025-03-24 17:11:41.0 +05:30:00
commit	434252fa1831465b36e32206684e78cd698e8462 [patch]
tree	712d1601d7cfb6991a343e1015a07b8931cdf896
parent	33a320d48dddb44ec7d838ae9fdeaa44fabba342
download	434252fa1831465b36e32206684e78cd698e8462.tar.gz
upload scrape_ecourtindia_v6

Diff

 .gitignore                                 |   3 +++
 .nvim.lua                                  |   6 ++++++
 flake.nix                                  |  17 ++++++++++++++++-
 scrape_ecourtindia_v6/clean.sh             |   5 +++++
 scrape_ecourtindia_v6/main.py              |  14 ++++++++++++++
 scrape_ecourtindia_v6/requirements.txt     |   3 +++
 scrape_ecourtindia_v6/run.sh               |   3 +++
 scrape_ecourtindia_v6/scraper.py           | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/web.py               |  20 ++++++++++++++++++++
 scrape_ecourtindia_v6/templates/index.html |  40 ++++++++++++++++++++++++++++++++++++++++
 10 files changed, 276 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 69f86b2..eac84c6 100644
--- a/.gitignore
+++ a/.gitignore
@@ -175,3 +175,6 @@

venv/
.direnv/
scrape_ecourtindia_v6/html/
scrape_ecourtindia_v6/pdf/
scrape_ecourtindia_v6/db.json
diff --git a/.nvim.lua b/.nvim.lua
new file mode 100644
index 0000000..9bc6e9c 100644
--- /dev/null
+++ a/.nvim.lua
@@ -1,0 +1,6 @@
--- Set Up Lspconfig
-- Project-local Neovim configuration: starts the pyright language server
-- through nvim-lspconfig for this repository.
local lspconfig = require('lspconfig')
-- Capabilities table supplied by cmp_nvim_lsp, handed to the server below.
local capabilities = require('cmp_nvim_lsp').default_capabilities()
lspconfig['pyright'].setup {
  capabilities = capabilities
}
diff --git a/flake.nix b/flake.nix
index 444c595..807fa45 100644
--- a/flake.nix
+++ a/flake.nix
@@ -6,7 +6,22 @@
    in {
      devShells.x86_64-linux.default = pkgs.mkShell {
        buildInputs = with pkgs; [
          python3
          (python3.withPackages (p: [
            p.selenium
            p.opencv-python
            p.pytesseract
            p.beautifulsoup4
            p.tinydb
            p.fastapi
            p.uvicorn
            p.jinja2
          ]))
          pyright

          firefox
          geckodriver

          tesseract
        ];
      };
    };
diff --git a/scrape_ecourtindia_v6/clean.sh b/scrape_ecourtindia_v6/clean.sh
new file mode 100755
index 0000000..bda1361 100755
--- /dev/null
+++ a/scrape_ecourtindia_v6/clean.sh
@@ -1,0 +1,5 @@
#!/usr/bin/env bash
# Reset the scraper's output: delete the saved HTML pages, the downloaded
# PDFs and db.json, then recreate empty html/ and pdf/ directories.
#
# All paths are relative, so run this from the scrape_ecourtindia_v6 directory.

# Remove the directories themselves rather than `html/*`: an unmatched glob
# would be passed to rm literally, and -f keeps an already-clean tree (no
# directories, no db.json yet) from producing errors.
rm -rf html pdf db.json

# -p: succeed whether or not the directory already exists.
mkdir -p html pdf
diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py
new file mode 100644
index 0000000..4794f8f 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/main.py
@@ -1,0 +1,14 @@
from scraper import Scraper
from tinydb import TinyDB
import os

# Database the scraper inserts its records into.
db = TinyDB('db.json')

# Output directories the scraper writes into; created up front so a fresh
# checkout works without running clean.sh first.
os.makedirs("html", exist_ok=True)
os.makedirs("pdf", exist_ok=True)

if __name__ == '__main__':
    m = Scraper(db)
    try:
        m.run()
        m.handle_views()
    finally:
        # quit() rather than close(): close() only closes the current window,
        # while quit() ends the WebDriver session. Doing it in `finally`
        # also releases the browser when scraping raises part-way through.
        m.driver.quit()
diff --git a/scrape_ecourtindia_v6/requirements.txt b/scrape_ecourtindia_v6/requirements.txt
new file mode 100644
index 0000000..78bea83 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/requirements.txt
@@ -1,0 +1,3 @@
# Scraper (main.py / scraper.py)
selenium
opencv-python
pytesseract
beautifulsoup4
tinydb
# Web viewer (web.py / run.sh)
fastapi
uvicorn
jinja2
diff --git a/scrape_ecourtindia_v6/run.sh b/scrape_ecourtindia_v6/run.sh
new file mode 100644
index 0000000..de47eaf 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/run.sh
@@ -1,0 +1,3 @@
#!/usr/bin/env bash

# Serve the `app` object from web.py with uvicorn. --reload is uvicorn's
# auto-reload (development) mode.
uvicorn web:app --reload
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
new file mode 100644
index 0000000..ebe559c 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/scraper.py
@@ -1,0 +1,166 @@
from time import sleep
import os
import uuid

from urllib import request

from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

from bs4 import BeautifulSoup

import cv2
import pytesseract
import tempfile

# Option values selected (via Select.select_by_value) in the search form's
# dropdowns by Scraper.goto_acts and Scraper.select_act.
# NOTE(review): presumably these are the site's internal codes for the named
# state / district / court complex / establishment — confirm against the form.
Karnataka = '3'  # 'sess_state_code' dropdown
Bengaluru = '20'  # 'sess_dist_code' dropdown
CMM_Court_Complex = '1030134@2,5,10,11,12,13,14@Y'  # 'court_complex_code' dropdown
Chief_Metropolitan = '10'  # 'court_est_code' dropdown

# 'actcode' dropdown value.
ACT = '23'

class Scraper:
    """Drive Firefox through the eCourts case-status search and archive results.

    Typical use is ``run()`` (fill in the search form and collect the result
    rows) followed by ``handle_views()`` (open each case, save its HTML and
    order PDFs, and insert one record per case into ``db``).

    Attributes:
        db: Object with an ``insert(dict)`` method that receives one record
            per processed case.
        driver: The Selenium Firefox driver; the caller is responsible for
            shutting it down.
        views: Parsed ``<td>`` cells holding each result row's "view" link.
        view_records: One record dict per entry in ``views`` (same order).
        orders: Parsed ``<td>`` cells holding the order links of the case
            that is currently open.
        current_view: The record of the case currently being processed.
    """

    def __init__(self, db):
        """Start Firefox and load the case-status page.

        Args:
            db: Database-like object used by ``handle_views`` via ``insert``.
        """
        self.db = db

        self.driver = Firefox()
        self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')

        # Initialised here so handle_views() is a harmless no-op when it is
        # called before parse_table() has populated anything.
        self.views = []
        self.view_records = []
        self.orders = []
        self.current_view = {}

    @staticmethod
    def _new_record(case_info, parties):
        """Build a fresh record dict for one result row.

        A new dict (with its own ``pdfs`` list) is returned on every call so
        that cases never share mutable state.
        """
        return {
            'case_info': case_info,
            # The stripped cell text has no whitespace around "Vs"; re-join
            # the parts with spaces for readability.
            'petitioner_respondent': ' Vs '.join(parties.split('Vs')),
            'htmlfile': '',
            'pdfs': [],
        }

    @staticmethod
    def _unique_path(directory, extension):
        """Return ``directory/<random hex>.<extension>`` that does not exist yet."""
        while True:
            path = f"{directory}/{uuid.uuid4().hex}.{extension}"
            if not os.path.exists(path):
                return path

    def run(self):
        """Perform the search: dismiss the modal, fill the form, parse results."""
        self.close_modal()
        self.goto_acts()
        self.select_act()
        self.parse_table()

    def close_modal(self):
        """Close the 'validateError' modal through the page's own closeModel()."""
        sleep(2)
        self.driver.execute_script('closeModel({modal_id:"validateError"})')
        sleep(1)

    def select(self, i_d, value):
        """Choose the option with ``value`` in the ``<select>`` whose id is ``i_d``.

        Sleeps one second afterwards before returning.
        """
        element = self.driver.find_element(By.ID, i_d)
        select = Select(element)
        select.select_by_value(value)
        sleep(1)

    def select_act(self):
        """Pick the configured act, restrict to disposed cases and submit."""
        self.select('actcode', ACT)
        sleep(1)

        # Disposed only
        self.driver.find_element(By.ID, 'radDAct').click()
        self.submit_search()

    def goto_acts(self):
        """Select state, district, court complex and establishment, then open the Act tab."""
        self.select('sess_state_code', Karnataka)
        self.select('sess_dist_code', Bengaluru)
        self.select('court_complex_code', CMM_Court_Complex)

        sleep(1)
        self.select('court_est_code', Chief_Metropolitan)
        sleep(1)
        element = self.driver.find_element(By.ID, 'act-tabMenu')
        element.click()
        sleep(1)

    def submit_search(self):
        """OCR the captcha image, type the result and submit the Act search.

        The captcha element is screenshotted to a temporary PNG, read back
        with OpenCV and recognised with pytesseract. No retry is attempted if
        the recognised text is wrong.
        """
        sleep(2)
        img = self.driver.find_element(By.ID, 'captcha_image')

        # The context manager closes and deletes the temporary file as soon as
        # the image has been loaded into memory.
        with tempfile.NamedTemporaryFile(suffix='.png') as temp:
            img.screenshot(temp.name)
            captcha = cv2.imread(temp.name)

        text = pytesseract.image_to_string(captcha).strip()

        element = self.driver.find_element(By.ID, 'act_captcha_code')
        element.send_keys(text)

        self.driver.execute_script('submitAct()')
        sleep(3)

    def parse_table(self):
        """Collect the view-link cell and a record for every result row.

        Reads the ``dispTable`` element and walks its ``<td>`` cells: the cell
        at index 5, 9, 13, ... is kept as the row's view link, and the two
        cells before it supply the case info and the party names.
        """
        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')

        self.views = []
        self.view_records = []
        for i in range(5, len(cells), 4):
            self.views.append(cells[i])
            # One record per row; keeping a single shared dict here would make
            # every stored case carry the last row's details.
            self.view_records.append(
                self._new_record(
                    cells[i - 2].get_text(strip=True),
                    cells[i - 1].get_text(strip=True),
                )
            )

    def handle_views(self, limit=10):
        """Open each collected case, archive it and insert its record.

        For every case the detail pane's HTML is written to ``html/``, its
        order PDFs are downloaded (see ``parse_orders_table``), the record is
        inserted into ``db`` and the browser navigates back to the results.

        Args:
            limit: Maximum number of cases to process (non-negative), or
                ``None`` to process all of them. Defaults to 10.
        """
        pairs = list(zip(self.views, self.view_records))
        if limit is not None:
            pairs = pairs[:limit]

        for view, record in pairs:
            self.current_view = record

            # The link's onclick handler is the JavaScript that opens the case.
            script = view.find_all('a')[0].get_attribute_list('onclick')[0]
            self.driver.execute_script(script)
            sleep(1)

            html = str(self.driver.find_element(By.ID, 'CSact').get_attribute('innerHTML'))

            filename = self._unique_path('html', 'html')
            record['htmlfile'] = filename
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)

            self.parse_orders_table()

            self.db.insert(record)
            self.driver.find_element(By.ID, 'main_back_act').click()

    def parse_orders_table(self):
        """Collect the order-link cells of the open case and download them.

        Returns without doing anything when the page has no element with
        class ``order_table``.
        """
        # find_elements() returns an empty list instead of raising, so no
        # exception handling is needed for the "no orders" case.
        tables = self.driver.find_elements(By.CLASS_NAME, 'order_table')
        if not tables:
            return

        table_innerhtml = tables[0].get_attribute('innerHTML')
        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')

        # The order-link cell is taken at index 5, 8, 11, ...
        self.orders = [cells[i] for i in range(5, len(cells), 3)]

        self.handle_orders()

    def handle_orders(self):
        """Download the PDF behind every collected order link into ``pdf/``.

        Each saved path is appended to ``current_view['pdfs']``.
        """
        for order in self.orders:
            script = order.find_all('a')[0].get_attribute_list('onclick')[0]
            self.driver.execute_script(script)

            sleep(2)
            obj = self.driver.find_element(By.TAG_NAME, 'object')
            pdf_url = str(obj.get_attribute('data'))

            filename = self._unique_path('pdf', 'pdf')

            # Send the browser's cookies along with the direct PDF request.
            cookies = "; ".join([f"{c['name']}={c['value']}" for c in self.driver.get_cookies()])
            r = request.Request(pdf_url)
            r.add_header("Cookie", cookies)

            with request.urlopen(r) as response, open(filename, "wb") as file:
                file.write(response.read())

            # Record the path only once the file has actually been written.
            self.current_view['pdfs'].append(filename)

            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
diff --git a/scrape_ecourtindia_v6/web.py b/scrape_ecourtindia_v6/web.py
new file mode 100644
index 0000000..195b81f 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/web.py
@@ -1,0 +1,20 @@
from tinydb import TinyDB

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates

# Database file queried by the index route below.
db = TinyDB('db.json')
app = FastAPI()

# Expose the html/ and pdf/ directories as static files under /html and /pdf.
app.mount("/html", StaticFiles(directory="html"), name="html")
app.mount("/pdf", StaticFiles(directory="pdf"), name="pdf")

# Jinja2 templates are loaded from the templates/ directory.
templates = Jinja2Templates(directory="templates")

@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Render index.html with every record currently in the database as ``views``."""
    context = {'views': db.all()}
    return templates.TemplateResponse(request=request, name="index.html", context=context)
diff --git a/scrape_ecourtindia_v6/templates/index.html b/scrape_ecourtindia_v6/templates/index.html
new file mode 100644
index 0000000..0b01b77 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/templates/index.html
@@ -1,0 +1,40 @@
<!DOCTYPE html>
<html lang="en">
<head>
  {# The charset declaration comes first so it is seen before any other content. #}
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="color-scheme" content="light dark">
  <title>Index</title>
  <link
    rel="stylesheet"
    href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.min.css"
  >
</head>
<body>
  <main class="container">
    {# One table row per record passed in as `views`. #}
    <table>
      <thead>
        <tr>
          <th scope="col">Case Info</th>
          <th scope="col">Petitioner/Respondent</th>
          <th scope="col">HTML File</th>
          <th scope="col">Orders</th>
        </tr>
      </thead>
      <tbody>
        {% for view in views %}
          <tr>
            <th scope="row">{{ view.case_info }}</th>
            <td>{{ view.petitioner_respondent }}</td>
            <td><a href='{{ view.htmlfile }}'>Open</a></td>
            <td>
              {# One link per entry in the record's `pdfs` list. #}
              {% for pdf in view.pdfs %}
                <a href='{{ pdf }}'>Open</a>
              {% endfor %}
            </td>
          </tr>
        {% endfor %}
      </tbody>
    </table>
  </main>
</body>
</html>