From 434252fa1831465b36e32206684e78cd698e8462 Mon Sep 17 00:00:00 2001
From: Raghuram Subramani <raghus2247@gmail.com>
Date: Mon, 24 Mar 2025 17:11:41 +0530
Subject: [PATCH] upload scrape_ecourtindia_v6

---
Review note: file contents below were revised during review (per-case
records in the scraper, WebDriver shutdown in try/finally, narrowed
exception handling, idempotent clean.sh, complete requirements.txt).
Blob ids on the "index" lines are those of the original commit.

 .gitignore                                 |   3 +++
 .nvim.lua                                  |   6 ++++++
 flake.nix                                  |  17 ++++++++++++++++-
 scrape_ecourtindia_v6/clean.sh             |   6 ++++++
 scrape_ecourtindia_v6/main.py              |  22 ++++++++++++++++++++++
 scrape_ecourtindia_v6/requirements.txt     |   8 ++++++++
 scrape_ecourtindia_v6/run.sh               |   3 +++
 scrape_ecourtindia_v6/scraper.py           | 192 ++++++++++++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/web.py               |  30 ++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/templates/index.html |  41 +++++++++++++++++++++++++++++++++++++++++
 10 files changed, 327 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 69f86b2..eac84c6 100644
--- a/.gitignore
+++ a/.gitignore
@@ -175,3 +175,6 @@
 venv/
 
 .direnv/
+scrape_ecourtindia_v6/html/
+scrape_ecourtindia_v6/pdf/
+scrape_ecourtindia_v6/db.json
diff --git a/.nvim.lua b/.nvim.lua
new file mode 100644
index 0000000..9bc6e9c 100644
--- /dev/null
+++ a/.nvim.lua
@@ -1,0 +1,6 @@
+--- Set Up Lspconfig
+local lspconfig = require('lspconfig')
+local capabilities = require('cmp_nvim_lsp').default_capabilities()
+lspconfig['pyright'].setup {
+  capabilities = capabilities
+}
diff --git a/flake.nix b/flake.nix
index 444c595..807fa45 100644
--- a/flake.nix
+++ a/flake.nix
@@ -6,7 +6,22 @@
   in {
     devShells.x86_64-linux.default = pkgs.mkShell {
       buildInputs = with pkgs; [
-        python3
+        (python3.withPackages (p: [
+          p.selenium
+          p.opencv-python
+          p.pytesseract
+          p.beautifulsoup4
+          p.tinydb
+          p.fastapi
+          p.uvicorn
+          p.jinja2
+        ]))
+        pyright
+
+        firefox
+        geckodriver
+
+        tesseract
       ];
     };
   };
diff --git a/scrape_ecourtindia_v6/clean.sh b/scrape_ecourtindia_v6/clean.sh
new file mode 100755
index 0000000..bda1361 100755
--- /dev/null
+++ a/scrape_ecourtindia_v6/clean.sh
@@ -1,0 +1,6 @@
+#!/usr/bin/env bash
+# Reset scraper output: delete downloaded pages, PDFs and the database,
+# then recreate the (now empty) output directories.
+rm -rf html pdf db.json
+
+mkdir -p html pdf
diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py
new file mode 100644
index 0000000..4794f8f 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/main.py
@@ -1,0 +1,22 @@
+"""Run the eCourts scraper and record its results in db.json."""
+
+import os
+
+from tinydb import TinyDB
+
+from scraper import Scraper
+
+db = TinyDB('db.json')
+
+os.makedirs("html", exist_ok=True)
+os.makedirs("pdf", exist_ok=True)
+
+if __name__ == '__main__':
+    m = Scraper(db)
+    try:
+        m.run()
+        m.handle_views()
+    finally:
+        # quit() ends the whole WebDriver session; close() would only
+        # close the current window, and would be skipped on errors.
+        m.driver.quit()
diff --git a/scrape_ecourtindia_v6/requirements.txt b/scrape_ecourtindia_v6/requirements.txt
new file mode 100644
index 0000000..78bea83 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/requirements.txt
@@ -1,0 +1,8 @@
+selenium
+opencv-python
+pytesseract
+beautifulsoup4
+tinydb
+fastapi
+uvicorn
+jinja2
diff --git a/scrape_ecourtindia_v6/run.sh b/scrape_ecourtindia_v6/run.sh
new file mode 100644
index 0000000..de47eaf 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/run.sh
@@ -1,0 +1,3 @@
+#!/usr/bin/env bash
+
+uvicorn web:app --reload
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
new file mode 100644
index 0000000..ebe559c 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/scraper.py
@@ -1,0 +1,192 @@
+"""Selenium scraper for the eCourts (ecourtindia_v6) case-status pages."""
+
+import os
+import tempfile
+import uuid
+from time import sleep
+from urllib import request
+
+import cv2
+import pytesseract
+from bs4 import BeautifulSoup
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver import Firefox
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.select import Select
+
+# Option values submitted to the search form's <select> elements.
+Karnataka = '3'
+Bengaluru = '20'
+CMM_Court_Complex = '1030134@2,5,10,11,12,13,14@Y'
+Chief_Metropolitan = '10'
+
+ACT = '23'
+
+
+def _unique_path(directory, suffix):
+    """Return a path in *directory* with a random name that is not yet taken."""
+    while True:
+        path = f"{directory}/{uuid.uuid4().hex}{suffix}"
+        if not os.path.exists(path):
+            return path
+
+
+class Scraper:
+    """Drive a Firefox session through an act search and save the results.
+
+    Case pages are written to ``html/``, order PDFs to ``pdf/``, and one
+    record per case is inserted into *db*.
+    """
+
+    def __init__(self, db):
+        """Open the case-status page; *db* must provide ``insert(dict)``."""
+        self.db = db
+
+        self.driver = Firefox()
+        self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
+
+        self.current_view = {}
+        self.views = []
+        self.view_records = []
+        self.orders = []
+
+    def run(self):
+        """Fill in the search form, submit it and parse the result table."""
+        self.close_modal()
+        self.goto_acts()
+        self.select_act()
+        self.parse_table()
+
+    def close_modal(self):
+        """Dismiss the "validateError" modal via the page's own script."""
+        sleep(2)
+        self.driver.execute_script('closeModel({modal_id:"validateError"})')
+        sleep(1)
+
+    def select(self, i_d, value):
+        """Choose option *value* in the <select> whose element id is *i_d*."""
+        element = self.driver.find_element(By.ID, i_d)
+        Select(element).select_by_value(value)
+        # Fixed pause after each selection (presumably lets dependent
+        # dropdowns reload).
+        sleep(1)
+
+    def goto_acts(self):
+        """Select state, district, complex and establishment; open the act tab."""
+        self.select('sess_state_code', Karnataka)
+        self.select('sess_dist_code', Bengaluru)
+        self.select('court_complex_code', CMM_Court_Complex)
+
+        sleep(1)
+        self.select('court_est_code', Chief_Metropolitan)
+        sleep(1)
+        self.driver.find_element(By.ID, 'act-tabMenu').click()
+        sleep(1)
+
+    def select_act(self):
+        """Pick the configured act, restrict to disposed cases and search."""
+        self.select('actcode', ACT)
+        sleep(1)
+
+        # Disposed only
+        self.driver.find_element(By.ID, 'radDAct').click()
+        self.submit_search()
+
+    def submit_search(self):
+        """OCR the captcha image, type the text in and submit the search."""
+        sleep(2)
+        captcha = self.driver.find_element(By.ID, 'captcha_image')
+        # The context manager removes the temporary screenshot once it
+        # has been read back for OCR.
+        with tempfile.NamedTemporaryFile(suffix='.png') as temp:
+            captcha.screenshot(temp.name)
+            text = pytesseract.image_to_string(cv2.imread(temp.name)).strip()
+
+        self.driver.find_element(By.ID, 'act_captcha_code').send_keys(text)
+
+        self.driver.execute_script('submitAct()')
+        sleep(3)
+
+    def parse_table(self):
+        """Collect the link cell and a fresh record for every result row.
+
+        ``self.views`` and ``self.view_records`` are filled in parallel:
+        entry *n* of each belongs to the same case.
+        """
+        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
+        self.views = []
+        self.view_records = []
+        # The first link cell is cell 5 and they repeat every 4 cells; the
+        # two cells before it hold the case info and the parties.
+        for i in range(5, len(cells), 4):
+            self.views.append(cells[i])
+            # One dict per row: sharing a single dict would leave every
+            # stored case with the last row's details.
+            self.view_records.append({
+                'case_info': cells[i - 2].get_text(strip=True),
+                'petitioner_respondent': ' Vs '.join(cells[i - 1].get_text(strip=True).split('Vs')),
+                'htmlfile': '',
+                'pdfs': [],
+            })
+
+    def handle_views(self, limit=10):
+        """Open each parsed case, save its page and orders, and store it.
+
+        At most *limit* cases are processed; pass ``None`` for all of them.
+        """
+        for view, record in zip(self.views[:limit], self.view_records):
+            self.current_view = record
+
+            script = view.find_all('a')[0].get_attribute_list('onclick')[0]
+            self.driver.execute_script(script)
+            sleep(1)
+
+            html = str(self.driver.find_element(By.ID, 'CSact').get_attribute('innerHTML'))
+
+            filename = _unique_path('html', '.html')
+            record['htmlfile'] = filename
+            with open(filename, "w", encoding="utf-8") as f:
+                f.write(html)
+
+            self.parse_orders_table()
+
+            self.db.insert(record)
+            self.driver.find_element(By.ID, 'main_back_act').click()
+
+    def parse_orders_table(self):
+        """Find the current case's order links and download them, if any."""
+        try:
+            table_innerhtml = self.driver.find_element(By.CLASS_NAME, 'order_table').get_attribute('innerHTML')
+        except NoSuchElementException:
+            # No order table on this page, so there is nothing to download.
+            return
+
+        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
+        # Order links start at cell 5 and repeat every 3 cells.
+        self.orders = cells[5::3]
+
+        self.handle_orders()
+
+    def handle_orders(self):
+        """Download every order PDF of the current case into ``pdf/``."""
+        for order in self.orders:
+            script = order.find_all('a')[0].get_attribute_list('onclick')[0]
+            self.driver.execute_script(script)
+
+            sleep(2)
+            obj = self.driver.find_element(By.TAG_NAME, 'object')
+            pdf_url = str(obj.get_attribute('data'))
+
+            filename = _unique_path('pdf', '.pdf')
+            # Send the browser's cookies along with the download request.
+            cookies = "; ".join(f"{c['name']}={c['value']}" for c in self.driver.get_cookies())
+            r = request.Request(pdf_url)
+            r.add_header("Cookie", cookies)
+
+            with request.urlopen(r) as response, open(filename, "wb") as file:
+                file.write(response.read())
+            # Recorded only after the file has been written.
+            self.current_view['pdfs'].append(filename)
+
+            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
diff --git a/scrape_ecourtindia_v6/web.py b/scrape_ecourtindia_v6/web.py
new file mode 100644
index 0000000..195b81f 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/web.py
@@ -1,0 +1,30 @@
+"""FastAPI front end listing the cases stored in db.json."""
+
+import os
+
+from fastapi import FastAPI, Request
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from tinydb import TinyDB
+
+db = TinyDB('db.json')
+app = FastAPI()
+
+# StaticFiles refuses to mount a directory that does not exist, so make
+# sure both are present even before the scraper has run.
+os.makedirs("html", exist_ok=True)
+os.makedirs("pdf", exist_ok=True)
+
+app.mount("/html", StaticFiles(directory="html"), name="html")
+app.mount("/pdf", StaticFiles(directory="pdf"), name="pdf")
+
+templates = Jinja2Templates(directory="templates")
+
+
+@app.get("/", response_class=HTMLResponse)
+async def index(request: Request):
+    """Render every stored case record as a table."""
+    return templates.TemplateResponse(
+        request=request, name="index.html", context={'views': db.all()}
+    )
diff --git a/scrape_ecourtindia_v6/templates/index.html b/scrape_ecourtindia_v6/templates/index.html
new file mode 100644
index 0000000..0b01b77 100644
--- /dev/null
+++ a/scrape_ecourtindia_v6/templates/index.html
@@ -1,0 +1,41 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <title>Index</title>
+  <link
+    rel="stylesheet"
+    href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.min.css"
+  >
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <meta name="color-scheme" content="light dark">
+</head>
+<body>
+  <main class="container">
+    <table>
+      <thead>
+        <tr>
+          <th scope="col">Case Info</th>
+          <th scope="col">Petitioner/Respondent</th>
+          <th scope="col">HTML File</th>
+          <th scope="col">Orders</th>
+        </tr>
+      </thead>
+      <tbody>
+        {% for view in views %}
+        <tr>
+          <th scope="row">{{ view.case_info }}</th>
+          <td>{{ view.petitioner_respondent }}</td>
+          <td><a href='{{ view.htmlfile }}'>Open</a></td>
+          <td>
+            {% for pdf in view.pdfs %}
+            <a href='{{ pdf }}'>Open</a>
+            {% endfor %}
+          </td>
+        </tr>
+        {% endfor %}
+      </tbody>
+    </table>
+  </main>
+</body>
+</html>
-- 
rgit 0.1.5