upload scrape_ecourtindia_v6
Diff
.gitignore | 3 +++
.nvim.lua | 6 ++++++
flake.nix | 17 ++++++++++++++++-
scrape_ecourtindia_v6/clean.sh | 5 +++++
scrape_ecourtindia_v6/main.py | 14 ++++++++++++++
scrape_ecourtindia_v6/requirements.txt | 3 +++
scrape_ecourtindia_v6/run.sh | 3 +++
scrape_ecourtindia_v6/scraper.py | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
scrape_ecourtindia_v6/web.py | 20 ++++++++++++++++++++
scrape_ecourtindia_v6/templates/index.html | 40 ++++++++++++++++++++++++++++++++++++++++
10 files changed, 276 insertions(+), 1 deletion(-)
@@ -175,3 +175,6 @@
venv/
.direnv/
scrape_ecourtindia_v6/html/
scrape_ecourtindia_v6/pdf/
scrape_ecourtindia_v6/db.json
@@ -1,0 +1,6 @@
-- Project-local Neovim configuration: start the pyright language server
-- with the completion capabilities advertised by cmp_nvim_lsp.
local lsp = require('lspconfig')
local cmp_capabilities = require('cmp_nvim_lsp').default_capabilities()

lsp.pyright.setup({
  capabilities = cmp_capabilities,
})
@@ -6,7 +6,22 @@
in {
devShells.x86_64-linux.default = pkgs.mkShell {
buildInputs = with pkgs; [
python3
(python3.withPackages (p: [
p.selenium
p.opencv-python
p.pytesseract
p.beautifulsoup4
p.tinydb
p.fastapi
p.uvicorn
p.jinja2
]))
pyright
firefox
geckodriver
tesseract
];
};
};
@@ -1,0 +1,5 @@
# Reset all scraper output: saved case pages, downloaded order PDFs and the
# TinyDB database file.
#
# -f keeps the script quiet when an output path is already absent, and the
# directories themselves are removed so hidden files do not survive a clean.
rm -rf html pdf db.json

# -p makes re-creation idempotent; the end state is always two empty
# directories that the scraper and the web app expect to find.
mkdir -p html pdf
@@ -1,0 +1,14 @@
from scraper import Scraper
from tinydb import TinyDB
import os
# Database that receives one document per scraped case view.
db = TinyDB('db.json')

# Output directories the scraper writes into; created up front so a fresh
# checkout (where both are git-ignored) works without a manual mkdir.
os.makedirs("html", exist_ok=True)
os.makedirs("pdf", exist_ok=True)

if __name__ == '__main__':
    m = Scraper(db)
    try:
        m.run()
        m.handle_views()
    finally:
        # Always end the WebDriver session, even when scraping fails part-way;
        # quit() shuts the whole session down rather than only closing the
        # current window, so no browser/driver process is left behind.
        m.driver.quit()
@@ -1,0 +1,3 @@
# Scraper (scraper.py / main.py)
selenium
opencv-python
pytesseract
beautifulsoup4
tinydb

# Web viewer (web.py / run.sh)
fastapi
uvicorn
jinja2
@@ -1,0 +1,3 @@
# Serve the FastAPI application object `app` from web.py with uvicorn;
# --reload restarts the server when source files change.
uvicorn web:app --reload
@@ -1,0 +1,166 @@
from time import sleep
import os
import uuid
from urllib import request
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import cv2
import pytesseract
import tempfile
# Option values passed to Select.select_by_value() for the search form's
# drop-downs. Each is the <option> value string, not the visible label.
Karnataka = '3'  # value chosen in the 'sess_state_code' drop-down
Bengaluru = '20'  # value chosen in the 'sess_dist_code' drop-down
CMM_Court_Complex = '1030134@2,5,10,11,12,13,14@Y'  # value chosen in 'court_complex_code'
Chief_Metropolitan = '10'  # value chosen in the 'court_est_code' drop-down
ACT = '23'  # value chosen in the 'actcode' drop-down
class Scraper:
    """Drive a Firefox session through the eCourts case-status search.

    The scraper fills in the search form, solves the captcha with OCR, reads
    the result table, and then, for each result, saves the case page as HTML,
    downloads the linked order PDFs and inserts one record into ``db``.

    Attributes:
        db: Object with an ``insert(dict)`` method (a TinyDB instance in
            main.py) that receives one record per processed view.
        driver: The Selenium Firefox driver; the caller is responsible for
            shutting it down.
        views: Result-table cells that hold the "view" link, one per case.
        view_records: Record dicts parallel to ``views`` (same index).
        current_view: The record currently being filled in.
        orders: Order-table cells holding the PDF links of the current view.
    """

    def __init__(self, db):
        """Open Firefox on the case-status page and initialise empty state."""
        self.db = db
        self.driver = Firefox()
        self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
        self.current_view = {}
        # Initialised here so handle_views() is safe to call even when
        # parse_table() has not populated anything yet.
        self.views = []
        self.view_records = []
        self.orders = []

    def run(self):
        """Perform the search and parse the result table into ``views``."""
        self.close_modal()
        self.goto_acts()
        self.select_act()
        self.parse_table()

    def close_modal(self):
        """Dismiss the 'validateError' modal via the page's own closeModel()."""
        sleep(2)
        self.driver.execute_script('closeModel({modal_id:"validateError"})')
        sleep(1)

    def select(self, i_d, value):
        """Choose ``value`` in the <select> element whose id is ``i_d``.

        Sleeps one second afterwards so the page can react to the change.
        """
        element = self.driver.find_element(By.ID, i_d)
        Select(element).select_by_value(value)
        sleep(1)

    def select_act(self):
        """Pick the act, click the 'radDAct' element and submit the search."""
        self.select('actcode', ACT)
        sleep(1)
        self.driver.find_element(By.ID, 'radDAct').click()
        self.submit_search()

    def goto_acts(self):
        """Select state, district, court complex and establishment, then open
        the tab with id 'act-tabMenu'."""
        self.select('sess_state_code', Karnataka)
        self.select('sess_dist_code', Bengaluru)
        self.select('court_complex_code', CMM_Court_Complex)
        sleep(1)
        self.select('court_est_code', Chief_Metropolitan)
        sleep(1)
        self.driver.find_element(By.ID, 'act-tabMenu').click()
        sleep(1)

    def submit_search(self):
        """OCR the captcha image, type the text and call the page's submitAct()."""
        sleep(2)
        captcha = self.driver.find_element(By.ID, 'captcha_image')
        # The context manager guarantees the temporary PNG is closed and
        # removed; cv2.imread loads the pixels into memory before that happens.
        with tempfile.NamedTemporaryFile(suffix='.png') as temp:
            captcha.screenshot(temp.name)
            image = cv2.imread(temp.name)
        text = pytesseract.image_to_string(image).strip()
        self.driver.find_element(By.ID, 'act_captcha_code').send_keys(text)
        self.driver.execute_script('submitAct()')
        sleep(3)

    @staticmethod
    def _unique_path(directory, extension):
        """Return ``directory/<uuid hex>.extension`` for a path not yet on disk."""
        while True:
            candidate = f"{directory}/{uuid.uuid4().hex}.{extension}"
            if not os.path.exists(candidate):
                return candidate

    @staticmethod
    def _build_views(cells):
        """Pair each "view" cell with a fresh record built from its row.

        Args:
            cells: Flat sequence of <td> elements (anything with
                ``get_text(strip=True)``) from the result table.

        Returns:
            A list of ``(view_cell, record)`` tuples. Every record owns its
            own ``pdfs`` list so data from different cases never mixes.
        """
        pairs = []
        # The view link sits at index 5, 9, 13, ...; the two cells before it
        # hold the case info and the parties. (Offsets presumably mirror the
        # layout of the 'dispTable' markup - confirm if the page changes.)
        for i in range(5, len(cells), 4):
            record = {
                'case_info': cells[i - 2].get_text(strip=True),
                'petitioner_respondent': ' Vs '.join(cells[i - 1].get_text(strip=True).split('Vs')),
                'htmlfile': '',
                'pdfs': [],
            }
            pairs.append((cells[i], record))
        return pairs

    def parse_table(self):
        """Read the 'dispTable' result table into ``views`` / ``view_records``."""
        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
        pairs = self._build_views(cells)
        self.views = [cell for cell, _ in pairs]
        self.view_records = [record for _, record in pairs]
        if self.view_records:
            # Kept for compatibility: after parsing, current_view refers to
            # the last row, as it always did.
            self.current_view = self.view_records[-1]

    def handle_views(self, limit=10):
        """Open each parsed view, save its HTML and orders, and store a record.

        Args:
            limit: Maximum number of views to process; ``None`` processes all
                of them. Defaults to 10, the previously hard-coded cap.
        """
        pairs = list(zip(self.views, self.view_records))
        if limit is not None:
            pairs = pairs[:limit]
        for view, record in pairs:
            # Each view fills in its own record, so stored documents carry
            # the case info and PDFs of that view only.
            self.current_view = record
            script = view.find_all('a')[0].get_attribute_list('onclick')[0]
            self.driver.execute_script(script)
            sleep(1)
            html = str(self.driver.find_element(By.ID, 'CSact').get_attribute('innerHTML'))
            filename = self._unique_path('html', 'html')
            record['htmlfile'] = filename
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)
            self.parse_orders_table()
            self.db.insert(record)
            self.driver.find_element(By.ID, 'main_back_act').click()

    def parse_orders_table(self):
        """Collect the order links of the open view and download them.

        Returns without doing anything when the page has no element with
        class 'order_table'.
        """
        # find_elements returns an empty list instead of raising, which
        # replaces the former catch-all ``except`` around find_element.
        tables = self.driver.find_elements(By.CLASS_NAME, 'order_table')
        if not tables:
            return
        table_innerhtml = tables[0].get_attribute('innerHTML')
        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
        # The link cell sits at index 5, 8, 11, ... of the flattened table.
        self.orders = [cells[i] for i in range(5, len(cells), 3)]
        self.handle_orders()

    def handle_orders(self):
        """Download the PDF behind every entry of ``orders`` into ``pdf/``.

        Each saved path is appended to ``current_view['pdfs']``.
        """
        for order in self.orders:
            script = order.find_all('a')[0].get_attribute_list('onclick')[0]
            self.driver.execute_script(script)
            sleep(2)
            obj = self.driver.find_element(By.TAG_NAME, 'object')
            pdf_url = str(obj.get_attribute('data'))
            filename = self._unique_path('pdf', 'pdf')
            # Forward the browser's cookies so the request is made within the
            # same session as the Selenium-driven page.
            cookies = "; ".join(f"{c['name']}={c['value']}" for c in self.driver.get_cookies())
            r = request.Request(pdf_url)
            r.add_header("Cookie", cookies)
            with request.urlopen(r) as response, open(filename, "wb") as file:
                file.write(response.read())
            # Recorded only once the file has actually been written.
            self.current_view['pdfs'].append(filename)
            # 'modalOders' is the id as used by the page; the spelling is intentional here.
            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
@@ -1,0 +1,20 @@
import os

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from tinydb import TinyDB
# Database written by the scraper; read on every request to the index page.
db = TinyDB('db.json')

app = FastAPI()

# StaticFiles checks that its directory exists when it is constructed, so a
# fresh checkout (both directories are git-ignored) would fail at import.
# Create them first, the same way main.py does.
os.makedirs("html", exist_ok=True)
os.makedirs("pdf", exist_ok=True)

# Serve the saved case pages and downloaded order PDFs as static files.
app.mount("/html", StaticFiles(directory="html"), name="html")
app.mount("/pdf", StaticFiles(directory="pdf"), name="pdf")

templates = Jinja2Templates(directory="templates")


@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Render index.html with every document in the database as ``views``."""
    return templates.TemplateResponse(
        request=request, name="index.html", context={'views': db.all()}
    )
@@ -1,0 +1,40 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    {# The charset declaration comes first so it is seen before any text content. #}
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="color-scheme" content="light dark">
    <title>Index</title>
    <link
      rel="stylesheet"
      href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.min.css"
    >
  </head>
  <body>
    <main class="container">
      {# One row per record passed in as `views`; each record provides
         case_info, petitioner_respondent, htmlfile and a list of pdfs. #}
      <table>
        <thead>
          <tr>
            <th scope="col">Case Info</th>
            <th scope="col">Petitioner/Respondent</th>
            <th scope="col">HTML File</th>
            <th scope="col">Orders</th>
          </tr>
        </thead>
        <tbody>
          {% for view in views %}
          <tr>
            <th scope="row">{{ view.case_info }}</th>
            <td>{{ view.petitioner_respondent }}</td>
            <td><a href='{{ view.htmlfile }}'>Open</a></td>
            <td>
              {% for pdf in view.pdfs %}
              <a href='{{ pdf }}'>Open</a>
              {% endfor %}
            </td>
          </tr>
          {% endfor %}
        </tbody>
      </table>
    </main>
  </body>
</html>