update
Diff
flake.nix | 50 +++++++++++++++++++++++++++++++-------------------
scrape_ecourtindia_v6/.gitignore | 6 ++++--
scrape_ecourtindia_v6/scrape_case_status.py | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
scrape_ecourtindia_v6/scrape_case_status_states.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
scrape_ecourtindia_v6/translate_to_english.py | 42 ++++++++++++++++++++++++++++++++++++++++++
test/.gitignore | 2 ++
test/transcribe.py | 14 ++++++++++++++
scrape_ecourtindia_v6/modules/scraper.py | 13 +++++++++++--
scrape_ecourtindia_v6/modules/scraper_case_status.py | 60 +++++++++++++++++++++++++++++++-----------------------------
scrape_ecourtindia_v6/results/scraping_results.csv | 1 +
10 files changed, 259 insertions(+), 141 deletions(-)
diff --git a/flake.nix b/flake.nix
@@ -1,28 +1,34 @@
{
inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
outputs = { self, nixpkgs, ... }: let
pkgs = import nixpkgs { system = "x86_64-linux"; config.allowUnfree = true; };
in {
devShells.x86_64-linux.default = pkgs.mkShell {
buildInputs = with pkgs; [
(python3.withPackages (p: [
p.selenium
p.opencv-python
p.pytesseract
p.beautifulsoup4
p.tinydb
p.fastapi
p.uvicorn
p.jinja2
]))
pyright
firefox
geckodriver
tesseract
];
};
system = "x86_64-linux";
pkgs = import nixpkgs { inherit system; config.allowUnfree = true; };
in {
devShells.${system}.default = pkgs.mkShell {
buildInputs = with pkgs; [
(python3.withPackages (p: [
p.selenium
p.opencv-python
p.pytesseract
p.beautifulsoup4
p.tinydb
p.fastapi
p.uvicorn
p.jinja2
]))
pyright
firefox
geckodriver
tesseract
];
};
};
}
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
@@ -1,6 +1,8 @@
courts.csv
*.csv
csv/*
named_pdf/*
pdf/*
html/*
orders.json
bak/
translated/*
*.json
diff --git a/scrape_ecourtindia_v6/scrape_case_status.py b/scrape_ecourtindia_v6/scrape_case_status.py
@@ -1,89 +1,67 @@
import csv
from time import sleep
from modules.scraper_case_status import ScraperCaseStatus
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
SCRAPE_ESTABLISHMENTS = True
class ThreadSafeCSVWriter:
def __init__(self, filename):
self.file = open(filename, 'w', newline='')
self.writer = csv.writer(self.file)
self.lock = threading.Lock()
def writerow(self, row):
with self.lock:
self.writer.writerow(row)
def close(self):
self.file.close()
def scrape_state_thread(state, config, csv_writer):
scraper = ScraperCaseStatus(config)
scraper.close_modal()
try:
scraper.select('sess_state_code', state)
for district in scraper.scrape_districts():
scraper.select('sess_dist_code', district)
for cmplx in scraper.scrape_complexes():
scraper.select('court_complex_code', cmplx)
if SCRAPE_ESTABLISHMENTS:
establishments = []
for establishment in scraper.scrape_establishments():
establishments.append(establishment)
csv_writer.writerow([ state, district, cmplx ] + establishments)
else:
csv_writer.writerow([ state, district, cmplx ])
except Exception as e:
print(f"Error scraping {state}: {e}")
finally:
scraper.driver.quit()
def scrape_courts():
config = {}
m = ScraperCaseStatus(config)
m.close_modal()
csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
csv_writer.writerow(['State', 'District', 'Complex'])
states = m.scrape_states()
m.driver.close()
with ThreadPoolExecutor(max_workers=5) as executor:
futures = [
executor.submit(scrape_state_thread, state, config, csv_writer)
for state in states
]
for future in as_completed(futures):
try:
future.result()
except Exception as e:
print(f"A thread encountered an error: {e}")
csv_writer.close()
from tinydb import TinyDB
def scrape_orders():
config = {}
db = TinyDB('db.json')
m = ScraperCaseStatus(config)
m.close_modal()
scraper = ScraperCaseStatus()
config['state'] = input('Select a state: ')
config['district'] = input('Select a district: ')
config['court_complex'] = input('Select a court complex: ')
config['court_establishment'] = input('Select a court establishment: ')
config['act'] = input('Select an act: ')
state = 'Karnataka'
act = 'Juvenile Justice (Care and Protection of Children) Act, 2015'
m.select_court()
m.goto_acts()
m.select_act()
m.handle_table()
scraper.close_modal()
scraper.select('sess_state_code', state)
sleep(1)
m.driver.close()
for district in scraper.scrape_districts():
print(f'SELECTING DISTRICT {district}')
while True:
try:
scraper.close_modal()
scraper.select('sess_dist_code', district)
break
except:
pass
sleep(1)
for cmplx in scraper.scrape_complexes():
sleep(1)
print(f'SELECTING COMPLEX {cmplx}')
while True:
try:
scraper.close_modal()
scraper.select('court_complex_code', cmplx)
break
except:
pass
try:
                scraper.driver.switch_to.alert.accept()
scraper.close_modal()
except:
pass
for establishment in scraper.scrape_establishments():
sleep(1)
print(f'SELECTING ESTABLISHMENT {establishment}')
while True:
try:
scraper.close_modal()
scraper.select('court_est_code', establishment)
break
except Exception as e:
print("EXCEPTION HANDLED:")
print(e)
sleep(1)
scraper.close_modal()
sleep(1)
scraper.goto_acts()
try:
scraper.select_act(act)
scraper.handle_table(db)
except Exception as e:
print("EXCEPTION HANDLED:")
print(e)
if __name__ == '__main__':
scrape_courts()
scraper.driver.close()
diff --git a/scrape_ecourtindia_v6/scrape_case_status_states.py b/scrape_ecourtindia_v6/scrape_case_status_states.py
@@ -1,0 +1,70 @@
import csv
from modules.scraper_case_status import ScraperCaseStatus
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
SCRAPE_ESTABLISHMENTS = True
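# csv.writer is not thread-safe; guard writes with a lock so rows from worker threads don't interleave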
class ThreadSafeCSVWriter:
def __init__(self, filename):
self.file = open(filename, 'w', newline='')
self.writer = csv.writer(self.file)
self.lock = threading.Lock()
def writerow(self, row):
with self.lock:
self.writer.writerow(row)
def close(self):
self.file.close()
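# Each worker drives its own browser session: a Selenium WebDriver must not be shared across threads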
def scrape_state_thread(state, config, csv_writer):
scraper = ScraperCaseStatus(config)
scraper.close_modal()
try:
scraper.select('sess_state_code', state)
for district in scraper.scrape_districts():
scraper.select('sess_dist_code', district)
for cmplx in scraper.scrape_complexes():
scraper.select('court_complex_code', cmplx)
if SCRAPE_ESTABLISHMENTS:
establishments = []
for establishment in scraper.scrape_establishments():
establishments.append(establishment)
csv_writer.writerow([ state, district, cmplx ] + establishments)
else:
csv_writer.writerow([ state, district, cmplx ])
except Exception as e:
print(f"Error scraping {state}: {e}")
finally:
scraper.driver.quit()
def scrape_courts():
config = {}
m = ScraperCaseStatus(config)
m.close_modal()
csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
csv_writer.writerow(['State', 'District', 'Complex'])
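    # Enumerate the states once with an initial session, then fan out one worker per state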
states = m.scrape_states()
m.driver.close()
with ThreadPoolExecutor(max_workers=5) as executor:
futures = [
executor.submit(scrape_state_thread, state, config, csv_writer)
for state in states
]
for future in as_completed(futures):
try:
future.result()
except Exception as e:
print(f"A thread encountered an error: {e}")
csv_writer.close()
if __name__ == '__main__':
scrape_courts()
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
@@ -1,0 +1,42 @@
from tempfile import TemporaryDirectory
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from tinydb import TinyDB
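# Tesseract language code: 'hin' selects the Hindi traineddata, which must be installed alongside tesseract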
language = 'hin'
def to_english(input_file, output_file):
image_file_list = []
with TemporaryDirectory() as tempdir:
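        # Rasterize each PDF page to a 500 DPI JPEG (pdf2image requires poppler to be installed)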
pdf_pages = convert_from_path(input_file, 500)
for page_enumeration, page in enumerate(pdf_pages, start=1):
filename = f"{tempdir}/page_{page_enumeration}.jpg"
page.save(filename, "JPEG")
image_file_list.append(filename)
with open(output_file, "a") as h:
for image_file in image_file_list:
            text = pytesseract.image_to_string(Image.open(image_file), lang=language)
            text = text.replace("-\n", "")  # rejoin words hyphenated across line breaks
            h.write(text)
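# OCR every order PDF recorded by the scraper (paths stored in orders.json) into a text file under translated/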
db = TinyDB('orders.json')
entries = db.all()
for entry in entries:
to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
diff --git a/test/.gitignore b/test/.gitignore
@@ -1,0 +1,2 @@
*.txt
*.mp3
diff --git a/test/transcribe.py b/test/transcribe.py
@@ -1,0 +1,14 @@
import os
import whisper
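# openai-whisper: load_model() accepts a model name such as 'medium' or a path to a checkpoint file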
def transcribe_audio(audio_file_path, model_path):
model = whisper.load_model(model_path)
result = model.transcribe(audio_file_path)
text_file_path = os.path.splitext(audio_file_path)[0] + ".txt"
with open(text_file_path, "w") as text_file:
text_file.write(result['text'])
audio_file_path = 'test.mp3'
transcribe_audio(audio_file_path, model_path='medium')
diff --git a/scrape_ecourtindia_v6/modules/scraper.py b/scrape_ecourtindia_v6/modules/scraper.py
@@ -20,8 +20,14 @@
sleep(1)
def select(self, i_d, value):
sleep(1)
element = self.driver.find_element(By.ID, i_d)
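        # Poll until the element is attached to the DOM instead of failing on the first lookup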
while True:
try:
element = self.driver.find_element(By.ID, i_d)
break
            except Exception:
                sleep(0.2)
select = Select(element)
select.select_by_visible_text(value)
sleep(1)
@@ -51,6 +57,9 @@
print(f'COMPLEXES: {complexes}')
return complexes
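    # The establishment dropdown is not always displayed; callers can check before trying to scrape it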
def establishments_visible(self):
return self.driver.find_element(By.ID, 'court_est_code').is_displayed()
def scrape_establishments(self):
element = self.driver.find_element(By.ID, 'court_est_code')
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -5,7 +5,6 @@
from urllib import request
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
@@ -13,45 +12,30 @@
import pytesseract
import tempfile
from tinydb import TinyDB
from .scraper import Scraper
class ScraperCaseStatus(Scraper):
def __init__(self, config):
Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
self.db = TinyDB('db.json')
self.config = config
def select_act(self):
self.select('actcode', self.config['act'])
def __init__(self):
Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index', headless=False)
def select_act(self, act):
self.select('actcode', act)
sleep(1)
self.driver.find_element(By.ID, 'radDAct').click()
self.submit_search()
def select_court(self):
sleep(2)
def goto_acts(self):
while True:
self.select('sess_state_code', self.config['state'])
self.select('sess_dist_code', self.config['district'])
self.select('court_complex_code', self.config['court_complex'])
sleep(2)
modal_is_open = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed()
if modal_is_open:
try:
self.close_modal()
continue
break
self.select('court_est_code', self.config['court_establishment'])
element = self.driver.find_element(By.ID, 'act-tabMenu')
element.click()
break
except:
pass
def goto_acts(self):
element = self.driver.find_element(By.ID, 'act-tabMenu')
element.click()
sleep(1)
def submit_search(self):
@@ -76,9 +60,13 @@
element.clear()
else:
captcha_incomplete = False
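    # A search can return no results, in which case dispTable never appears; bail out instead of raising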
def handle_table(self, db):
try:
table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
except:
return
def handle_table(self):
table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
self.views = []
i = 5
@@ -109,7 +97,7 @@
self.parse_orders_table()
self.db.insert(self.current_view)
db.insert(self.current_view)
print(f'INSERTED: {self.current_view}')
self.driver.find_element(By.ID, 'main_back_act').click()
i += 4
@@ -134,7 +122,7 @@
script = order.find_all('a')[0].get_attribute_list('onclick')[0]
self.driver.execute_script(script)
sleep(0.7)
sleep(1)
obj = self.driver.find_element(By.TAG_NAME, 'object')
pdf_url = str(obj.get_attribute('data'))
@@ -153,4 +141,10 @@
except:
print(f'UNABLE TO FETCH PDF: {pdf_url}')
self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
sleep(1)
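        # The close button may not be clickable immediately after the PDF loads; retry until it is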
while True:
try:
self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
break
except:
pass
diff --git a/scrape_ecourtindia_v6/results/scraping_results.csv b/scrape_ecourtindia_v6/results/scraping_results.csv
@@ -1,0 +1,1 @@
State,District,Complex,Establishment,Records