update a few scripts: thread the district through the eCourts scraper and its CSV/PDF exports, add EasyOCR transcription (transcribe.py) and Hindi keyword search (search_for_words.py), drop translate_to_english.py, and rework the rev.js web-service client against crypto-js 3.1.2
Diff
flake.nix                                       |   7 ++++++-
rev/package-lock.json                           |   9 ++++-----
rev/package.json                                |   2 +-
rev/rev.js                                      |  37 +++++++++++++++----------------------
scrape_ecourtindia_v6/.gitignore                |   2 ++
scrape_ecourtindia_v6/create_csv.py             |   4 ++--
scrape_ecourtindia_v6/create_named_pdfs.py      |   3 ++-
scrape_ecourtindia_v6/scrape_orders.py          |   9 ++++-----
scrape_ecourtindia_v6/search_for_words.py       | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
scrape_ecourtindia_v6/transcribe.py             | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
scrape_ecourtindia_v6/translate_to_english.py   |  42 ------------------------------------------
scrape_ecourtindia_v6/modules/scraper_orders.py |   5 +++--
12 files changed, 249 insertions(+), 82 deletions(-)
@@ -11,6 +11,7 @@
p.selenium
p.opencv-python
p.pytesseract
p.easyocr
p.beautifulsoup4
p.tinydb
p.fastapi
@@ -19,10 +20,14 @@
p.streamlit
p.gradio
p.pdf2image
p.argostranslate
]))
python3Packages.pymupdf
mupdf
pyright
@@ -5,15 +5,14 @@
"packages": {
"": {
"dependencies": {
"crypto-js": "^4.2.0",
"crypto-js": "3.1.2",
"node-fetch": "^3.3.2"
}
},
"node_modules/crypto-js": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-4.2.0.tgz",
"integrity": "sha512-KALDyEYgpY+Rlob/iriUtjV6d5Eq+Y191A5g4UqLAi8CyGP9N1+FdVbkc1SxKc2r4YAYqG8JzO2KGL+AizD70Q==",
"license": "MIT"
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-3.1.2.tgz",
"integrity": "sha512-egolhMvFgIelOG34Goj51J6MUEMr2X8mSam6+54pXiPxcOAfRU68CgsELFsEI5hXZLk0hvUwc7y1VuHi+5RMIQ=="
},
"node_modules/data-uri-to-buffer": {
"version": "4.0.1",
@@ -1,6 +1,6 @@
{
"dependencies": {
"crypto-js": "^4.2.0",
"crypto-js": "3.1.2",
"node-fetch": "^3.3.2"
}
}
@@ -9,17 +9,6 @@
let globaliv = "4B6250655368566D";
let globalIndex = 0;
async function checkDeviceOnlineStatus() {
try {
await dns.lookup('google.com');
return true;
} catch {
console.error("Please check your internet connection and try again");
return false;
}
}
function showErrorMessage(message) {
console.error("Error:", message);
@@ -74,19 +63,21 @@
async function callToWebService(url, data, callback) {
const online = await checkDeviceOnlineStatus();
if (!online) return;
try {
const encryptedData = encryptData(data);
const headers = {
'Content-Type': 'application/json',
'user-agent': 'eCourtsServices/2.0.1 (iPhone; iOS 18.4; Scale/3.00)'
};
headers['Authorization'] = 'Bearer ' + encryptData(jwttoken);
const fullUrl = url;
const params = new URLSearchParams({ data: encryptedData });
const fullUrl = `${url}?${params.toString()}`;
console.log(data);
console.log(fullUrl);
const res = await fetch(fullUrl, {
method: 'GET',
@@ -94,6 +85,8 @@
});
const responseText = await res.text();
    console.log(`responseText:\n${responseText}\n`);
const decodedResponse = JSON.parse(decodeResponse(responseText));
if (decodedResponse.token) {
@@ -108,7 +101,7 @@
const packageName = "com.eCourts.mobile";
const uidObj = { uid: "324456:" + packageName };
const newData = { ...data, ...uidObj };
return callToWebService(url, newData, callback);
return await callToWebService(url, newData, callback);
} else {
showErrorMessage("Session expired!");
}
@@ -133,15 +126,13 @@
async function getCourtComplexes(state_code, dist_code, callback) {
const url = hostIP + "courtEstWebService.php";
const data = {
action_code: "fillCourtComplex",
state_code,
dist_code
};
const url = hostIP + "appReleaseWebService.php";
let data = 'fillState';
await callToWebService(url, data, callback);
}
getCourtComplexes("1", "101", (res) => {
console.log("Court Complexes:", res.courtComplex);
});
console.log(decodeResponse('POaJ42M9nP6pkEJim6CFmQ=='));
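
For context on the rev.js changes: callToWebService now sends the encrypted payload as a single `data` query parameter on a GET request, and the Authorization header carries the JWT run through the same encryption. A minimal Python sketch of that request shape, with encrypt_data and decode_response as placeholders for the crypto-js helpers this diff does not show:

    import json
    import requests  # stand-in for node-fetch

    def call_to_web_service(url, data, jwt_token, encrypt_data, decode_response):
        headers = {
            'Content-Type': 'application/json',
            'user-agent': 'eCourtsServices/2.0.1 (iPhone; iOS 18.4; Scale/3.00)',
            # the bearer token is the *encrypted* JWT, not the raw one
            'Authorization': 'Bearer ' + encrypt_data(jwt_token),
        }
        # the encrypted body rides in one query parameter, mirroring
        # `new URLSearchParams({ data: encryptedData })` in rev.js
        res = requests.get(url, params={'data': encrypt_data(data)}, headers=headers)
        # responses come back encrypted too and must be decoded before parsing
        return json.loads(decode_response(res.text))
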
@@ -6,3 +6,5 @@
bak/
translated/*
*.json
transcribed/*
txt/*
@@ -6,10 +6,10 @@
csvfile = open('orders.csv', 'w', newline='')
w = csv.writer(csvfile)
w.writerow(['Court Name', 'Case Info', 'Petitioner/Respondent', 'Date', 'File'])
w.writerow(['District', 'Court Name', 'Case Info', 'Petitioner/Respondent', 'Date', 'File'])
for entry in entries:
ent = [entry['court_name'], entry['case_info'], entry['petitioner_respondent'], entry['date'], f'http://aarch.compromyse.xyz:8000/{entry["filename"]}']
ent = [entry['district'], entry['court_name'], entry['case_info'], entry['petitioner_respondent'], entry['date'], f'http://aarch.compromyse.xyz:8000/{entry["filename"]}']
w.writerow(ent)
csvfile.close()
@@ -13,11 +13,12 @@
entries = db.all()
for entry in entries:
district = sanitize_filename(entry['district'])
date = sanitize_filename(entry['date'])
case_info = sanitize_filename(entry['case_info'])
court_name = sanitize_filename(entry['court_name'])
newname = f"named_pdf/{date}---{case_info}---{court_name}.pdf"
newname = f"named_pdf/{district}---{date}---{case_info}---{court_name}.pdf"
try:
shutil.copyfile(entry['filename'], newname)
@@ -51,7 +51,7 @@
scraper.submit_search()
scraper.parse_orders_table()
scraper.handle_orders(row[3])
scraper.handle_orders(row[3], row[1])
scraper.driver.quit()
@@ -63,7 +63,7 @@
reader = csv.reader(csvfile)
courts = list(reader)
with ThreadPoolExecutor(max_workers=5) as executor:
with ThreadPoolExecutor(max_workers=1) as executor:
futures = [
executor.submit(scrape_single_court, court)
for court in courts
@@ -75,6 +75,5 @@
except Exception as e:
print(f"A thread encountered an error: {e}")
if __name__ == '__main__':
input_file = 'csv/2023-24_pocso.csv'
scrape_orders(input_file)
input_file = 'csv/2023-24_pocso_all_districts.csv'
scrape_orders(input_file)
@@ -0,0 +1,109 @@
import os
import csv
import re
import argostranslate.translate
installed_languages = argostranslate.translate.load_installed_languages()
hi_lang = next(filter(lambda x: x.code == "hi", installed_languages))
en_lang = next(filter(lambda x: x.code == "en", installed_languages))
translator = hi_lang.get_translation(en_lang)
phrases = [
    "किशोर",              # "juvenile"
    "किशोर न्यायालय",      # "juvenile court"
    "बोर्ड",              # "board"
    "प्रारंभिक आकलन",      # "preliminary assessment"
    "प्रारंभिक निर्धारण",   # "preliminary determination"
    "बालक"               # "child"
]
main_phrases = ["किशोर", "किशोर न्यायालय"]  # these two get up to 10 snippet columns each
input_dir = "txt"
output_csv_hindi = "output_hindi.csv"
output_csv_english = "output_english.csv"
base_url = "https://aarch.compromyse.xyz:8000/txt/"
def extract_snippets(text, phrase, window=10, max_count=10):
    words = text.split()
    n = len(phrase.split())
    snippets = []
    # Slide an n-word window so multi-word phrases such as
    # "किशोर न्यायालय" can match too; keep `window` words of
    # context on each side of a hit, up to `max_count` hits.
    for i in range(len(words) - n + 1):
        if phrase in ' '.join(words[i:i + n]):
            start = max(0, i - window)
            end = min(len(words), i + n + window)
            snippets.append(' '.join(words[start:end]))
            if len(snippets) >= max_count:
                break
    return snippets
header = ["File", "File URL"]
for phrase in phrases:
header.append(f"{phrase} Present")
if phrase in main_phrases:
for i in range(1, 11):
header.append(f"{phrase} Snippet {i}")
else:
header.append(f"{phrase} Snippet")
results = []
for filename in os.listdir(input_dir):
if filename.endswith(".txt"):
filepath = os.path.join(input_dir, filename)
with open(filepath, 'r', encoding='utf-8') as f:
text = f.read()
file_url = base_url + filename
row = [filename, file_url]
for phrase in phrases:
found = phrase in text
row.append("Yes" if found else "No")
if found:
snippets = extract_snippets(text, phrase, max_count=10)
if phrase in main_phrases:
row.extend(snippets + [""] * (10 - len(snippets)))
else:
row.append(snippets[0] if snippets else "")
else:
if phrase in main_phrases:
row.extend([""] * 10)
else:
row.append("")
results.append(row)
with open(output_csv_hindi, 'w', encoding='utf-8-sig', newline='') as f:
writer = csv.writer(f)
writer.writerow(header)
writer.writerows(results)
translated_header = [translator.translate(cell) if re.search(r'[\u0900-\u097F]', cell) else cell for cell in header]
translated_rows = [translated_header]
for row in results:
translated_row = []
for cell in row:
try:
if re.search(r'[\u0900-\u097F]', cell):
translated_row.append(translator.translate(cell))
else:
translated_row.append(cell)
        except Exception:
translated_row.append(cell)
translated_rows.append(translated_row)
with open(output_csv_english, 'w', encoding='utf-8-sig', newline='') as f:
writer = csv.writer(f)
writer.writerows(translated_rows)
print(f"✅ Hindi CSV saved to: {output_csv_hindi}")
print(f"✅ English CSV saved to: {output_csv_english}")
@@ -0,0 +1,102 @@
import os
import easyocr
import shutil
import csv
from pdf2image import convert_from_path
from concurrent.futures import ThreadPoolExecutor, as_completed
def read_csv_filenames(csv_path):
filenames = set()
with open(csv_path, newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
            if len(row) > 4:
                # the PDF file name sits in the fifth column (index 4)
                filename = row[4].strip()
if filename.lower().endswith('.pdf'):
filenames.add(filename)
return filenames
def process_pdf(pdf_path, output_folder, dpi=300, lang='hi'):
    # build a reader for the requested language instead of hard-coding Hindi
    reader = easyocr.Reader([lang], gpu=True)
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
pdf_output_dir = os.path.join(output_folder, pdf_name)
images_dir = os.path.join(pdf_output_dir, "images")
os.makedirs(images_dir, exist_ok=True)
try:
images = convert_from_path(pdf_path, dpi=dpi)
ocr_texts = []
for i, image in enumerate(images):
image_path = os.path.join(images_dir, f"page_{i+1}.png")
image.save(image_path, "PNG")
result = reader.readtext(image_path, detail=0)
text = "\n".join(result)
ocr_texts.append(f"--- Page {i+1} ---\n{text.strip()}\n")
ocr_output_path = os.path.join(pdf_output_dir, "ocr_output.txt")
with open(ocr_output_path, "w", encoding="utf-8") as f:
f.write("\n".join(ocr_texts))
print(f"✅ Processed with GPU: {pdf_path} → {ocr_output_path}")
except Exception as e:
print(f"❌ Error processing {pdf_path}: {e}")
def collect_txt_files(base_output_folder, destination_folder):
os.makedirs(destination_folder, exist_ok=True)
for root, dirs, files in os.walk(base_output_folder):
for file in files:
if file == "ocr_output.txt":
full_path = os.path.join(root, file)
new_name = os.path.basename(os.path.dirname(full_path)) + ".txt"
dest_path = os.path.join(destination_folder, new_name)
shutil.copy(full_path, dest_path)
print(f"📁 Copied: {full_path} → {dest_path}")
def batch_process_folder(input_folder, output_folder, csv_path, dpi=300, lang='hi', max_threads=32):
os.makedirs(output_folder, exist_ok=True)
valid_filenames = read_csv_filenames(csv_path)
pdf_files = [
os.path.join(input_folder, filename)
for filename in os.listdir(input_folder)
if filename in valid_filenames
]
print(f'number_of_files: {len(pdf_files)}')
if not pdf_files:
print("⚠️ No matching PDF files found in input folder.")
return
with ThreadPoolExecutor(max_workers=max_threads) as executor:
futures = {
executor.submit(process_pdf, pdf_path, output_folder, dpi, lang): pdf_path
for pdf_path in pdf_files
}
for future in as_completed(futures):
pdf_path = futures[future]
try:
future.result()
except Exception as e:
print(f"⚠️ Failed to process {pdf_path}: {e}")
input_folder = "pdf"
output_folder = "transcribed"
csv_path = "files.csv"

# run the OCR batch before collecting the per-PDF text files
batch_process_folder(input_folder, output_folder, csv_path)
collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))
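
Worth noting: process_pdf constructs a fresh easyocr.Reader for every PDF, so with max_threads=32 each worker loads its own copy of the detection and recognition models onto the GPU. A sketch of sharing one reader behind a lock instead (assuming a single GPU; EasyOCR readers are not documented as thread-safe):

    import threading
    import easyocr

    # one shared model instance; the lock serializes GPU inference so
    # threads don't interleave readtext calls on the same reader
    shared_reader = easyocr.Reader(['hi'], gpu=True)
    reader_lock = threading.Lock()

    def ocr_image(image_path):
        with reader_lock:
            return "\n".join(shared_reader.readtext(image_path, detail=0))

With this layout the pdf2image conversion still parallelizes across threads; only the recognition step is serialized.
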
@@ -1,42 +0,0 @@
from tempfile import TemporaryDirectory
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from tinydb import TinyDB
language = 'hin'
def to_english(input_file, output_file):
image_file_list = []
with TemporaryDirectory() as tempdir:
pdf_pages = convert_from_path(input_file, 500)
for page_enumeration, page in enumerate(pdf_pages, start=1):
filename = f"{tempdir}/page_{page_enumeration}.jpg"
page.save(filename, "JPEG")
image_file_list.append(filename)
with open(output_file, "a") as h:
for image_file in image_file_list:
text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
text = text.replace("-\n", "")
breakpoint()
h.write(text)
db = TinyDB('orders.json')
entries = db.all()
for entry in entries:
to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
@@ -71,7 +71,7 @@
self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ])
i += 5
def handle_orders(self, court_name):
def handle_orders(self, court_name, district):
for row in self.rows:
order = row[0]
@@ -97,7 +97,8 @@
except:
print(f'UNABLE TO FETCH PDF: {pdf_url}')
record = { 'court_name': court_name, 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
record = { 'district': district, 'court_name': court_name, 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
self.db.insert(record)
sleep(0.7)
self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()