From 3ed36b1adb0be6a450afb755e192a7198187e052 Mon Sep 17 00:00:00 2001
From: Raghuram Subramani <raghus2247@gmail.com>
Date: Tue, 22 Apr 2025 20:54:47 +0530
Subject: [PATCH] update a few scripts

---
 flake.nix                                       |   7 ++++++-
 rev/package-lock.json                           |   9 ++++-----
 rev/package.json                                |   2 +-
 rev/rev.js                                      |  37 +++++++++++++++----------------------
 scrape_ecourtindia_v6/.gitignore                |   2 ++
 scrape_ecourtindia_v6/create_csv.py             |   4 ++--
 scrape_ecourtindia_v6/create_named_pdfs.py      |   3 ++-
 scrape_ecourtindia_v6/scrape_orders.py          |   9 ++++-----
 scrape_ecourtindia_v6/search_for_words.py       | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/transcribe.py             | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/translate_to_english.py   |  42 ------------------------------------------
 scrape_ecourtindia_v6/modules/scraper_orders.py |   5 +++--
 12 files changed, 249 insertions(+), 82 deletions(-)

diff --git a/flake.nix b/flake.nix
index 838035c..8e0575f 100644
--- a/flake.nix
+++ a/flake.nix
@@ -11,6 +11,7 @@
         p.selenium
         p.opencv-python
         p.pytesseract
+        p.easyocr
         p.beautifulsoup4
         p.tinydb
         p.fastapi
@@ -19,10 +20,14 @@
         p.streamlit
         p.gradio
 
-        # p.pdf2image
+        p.pdf2image
+        p.argostranslate
         # p.openai-whisper
         # p.torch-bin
       ]))
+
+      python3Packages.pymupdf
+      mupdf
 
       pyright
 
diff --git a/rev/package-lock.json b/rev/package-lock.json
index 7351f0a..5055186 100644
--- a/rev/package-lock.json
+++ a/rev/package-lock.json
@@ -5,15 +5,14 @@
   "packages": {
     "": {
       "dependencies": {
-        "crypto-js": "^4.2.0",
+        "crypto-js": "3.1.2",
         "node-fetch": "^3.3.2"
       }
     },
     "node_modules/crypto-js": {
-      "version": "4.2.0",
-      "resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-4.2.0.tgz",
-      "integrity": "sha512-KALDyEYgpY+Rlob/iriUtjV6d5Eq+Y191A5g4UqLAi8CyGP9N1+FdVbkc1SxKc2r4YAYqG8JzO2KGL+AizD70Q==",
-      "license": "MIT"
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-3.1.2.tgz",
+      "integrity": "sha512-egolhMvFgIelOG34Goj51J6MUEMr2X8mSam6+54pXiPxcOAfRU68CgsELFsEI5hXZLk0hvUwc7y1VuHi+5RMIQ=="
     },
     "node_modules/data-uri-to-buffer": {
       "version": "4.0.1",
diff --git a/rev/package.json b/rev/package.json
index 0446880..80c9499 100644
--- a/rev/package.json
+++ a/rev/package.json
@@ -1,6 +1,6 @@
 {
   "dependencies": {
-    "crypto-js": "^4.2.0",
+    "crypto-js": "3.1.2",
     "node-fetch": "^3.3.2"
   }
 }
diff --git a/rev/rev.js b/rev/rev.js
index 2bfd9ef..d8c0ed8 100644
--- a/rev/rev.js
+++ a/rev/rev.js
@@ -9,17 +9,6 @@
 let globaliv = "4B6250655368566D";
 let globalIndex = 0;
 
-// Utility: Check internet connection
-async function checkDeviceOnlineStatus() {
-  try {
-    await dns.lookup('google.com');
-    return true;
-  } catch {
-    console.error("Please check your internet connection and try again");
-    return false;
-  }
-}
-
 // Show error message (replace alert with console)
 function showErrorMessage(message) {
   console.error("Error:", message);
@@ -74,19 +63,21 @@
 
 // API call wrapper
 async function callToWebService(url, data, callback) {
-  const online = await checkDeviceOnlineStatus();
-  if (!online) return;
-
   try {
     const encryptedData = encryptData(data);
 
     const headers = {
       'Content-Type': 'application/json',
+      'user-agent': 'eCourtsServices/2.0.1 (iPhone; iOS 18.4; Scale/3.00)'
     };
     headers['Authorization'] = 'Bearer ' + encryptData(jwttoken);
+
+    // const params = new URLSearchParams({ action_code: encryptedData });
+    // const fullUrl = `${url}?${params.toString()}`;
+    const fullUrl = url;
 
-    const params = new URLSearchParams({ data: encryptedData });
-    const fullUrl = `${url}?${params.toString()}`;
+    console.log(data);
+    console.log(fullUrl);
 
     const res = await fetch(fullUrl, {
       method: 'GET',
@@ -94,6 +85,8 @@
     });
 
     const responseText = await res.text();
+
+    console.log(`responseText:\n${responseText}\n`)
 
     const decodedResponse = JSON.parse(decodeResponse(responseText));
     if (decodedResponse.token) {
@@ -108,7 +101,7 @@
       const packageName = "com.eCourts.mobile";
       const uidObj = { uid: "324456:" + packageName };
       const newData = { ...data, ...uidObj };
-      return callToWebService(url, newData, callback);
+      return await callToWebService(url, newData, callback);
     } else {
       showErrorMessage("Session expired!");
     }
@@ -133,15 +126,13 @@
 
 // Fetch Court Complexes
 async function getCourtComplexes(state_code, dist_code, callback) {
-  const url = hostIP + "courtEstWebService.php";
-  const data = {
-    action_code: "fillCourtComplex",
-    state_code,
-    dist_code
-  };
+  const url = hostIP + "appReleaseWebService.php";
+  let data = 'fillState';
 
   await callToWebService(url, data, callback);
 }
 
 getCourtComplexes("1", "101", (res) => {
   console.log("Court Complexes:", res.courtComplex);
 });
+
+console.log(decodeResponse('POaJ42M9nP6pkEJim6CFmQ=='));
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index 1aed0d4..36f0da5 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ a/scrape_ecourtindia_v6/.gitignore
@@ -6,3 +6,5 @@
 bak/
 translated/*
 *.json
+transcribed/*
+txt/*
diff --git a/scrape_ecourtindia_v6/create_csv.py b/scrape_ecourtindia_v6/create_csv.py
index 5561b73..1bf8860 100644
--- a/scrape_ecourtindia_v6/create_csv.py
+++ a/scrape_ecourtindia_v6/create_csv.py
@@ -6,10 +6,10 @@
 csvfile = open('orders.csv', 'w', newline='')
 w = csv.writer(csvfile)
 
-w.writerow(['Court Name', 'Case Info', 'Petitioner/Respondent', 'Date', 'File'])
+w.writerow(['District', 'Court Name', 'Case Info', 'Petitioner/Respondent', 'Date', 'File'])
 
 for entry in entries:
-    ent = [entry['court_name'], entry['case_info'], entry['petitioner_respondent'], entry['date'], f'http://aarch.compromyse.xyz:8000/{entry["filename"]}']
+    ent = [entry['district'], entry['court_name'], entry['case_info'], entry['petitioner_respondent'], entry['date'], f'http://aarch.compromyse.xyz:8000/{entry["filename"]}']
     w.writerow(ent)
 
 csvfile.close()
diff --git a/scrape_ecourtindia_v6/create_named_pdfs.py b/scrape_ecourtindia_v6/create_named_pdfs.py
index c47c66e..a37fc10 100644
--- a/scrape_ecourtindia_v6/create_named_pdfs.py
+++ a/scrape_ecourtindia_v6/create_named_pdfs.py
@@ -13,11 +13,12 @@
 entries = db.all()
 
 for entry in entries:
+    district = sanitize_filename(entry['district'])
     date = sanitize_filename(entry['date'])
     case_info = sanitize_filename(entry['case_info'])
     court_name = sanitize_filename(entry['court_name'])
 
-    newname = f"named_pdf/{date}---{case_info}---{court_name}.pdf"
+    newname = f"named_pdf/{district}---{date}---{case_info}---{court_name}.pdf"
 
     try:
         shutil.copyfile(entry['filename'], newname)
diff --git a/scrape_ecourtindia_v6/scrape_orders.py b/scrape_ecourtindia_v6/scrape_orders.py
index 146119e..e254967 100644
--- a/scrape_ecourtindia_v6/scrape_orders.py
+++ a/scrape_ecourtindia_v6/scrape_orders.py
@@ -51,7 +51,7 @@
     scraper.submit_search()
 
     scraper.parse_orders_table()
-    scraper.handle_orders(row[3])
+    scraper.handle_orders(row[3], row[1])
 
     scraper.driver.quit()
 
@@ -63,7 +63,7 @@
         reader = csv.reader(csvfile)
         courts = list(reader)
 
-    with ThreadPoolExecutor(max_workers=5) as executor:
+    with ThreadPoolExecutor(max_workers=1) as executor:
         futures = [
            executor.submit(scrape_single_court, court)
            for court in courts
@@ -75,6 +75,5 @@
         except Exception as e:
             print(f"A thread encountered an error: {e}")
 
-if __name__ == '__main__':
-    input_file = 'csv/2023-24_pocso.csv'
-    scrape_orders(input_file)
+input_file = 'csv/2023-24_pocso_all_districts.csv'
+scrape_orders(input_file)
diff --git a/scrape_ecourtindia_v6/search_for_words.py b/scrape_ecourtindia_v6/search_for_words.py
new file mode 100644
index 0000000..effcea9
--- /dev/null
+++ a/scrape_ecourtindia_v6/search_for_words.py
@@ -1,0 +1,109 @@
+import os
+import csv
+import re
+import argostranslate.translate
+
+# Load Argos Translate model (assumes it's already installed)
+installed_languages = argostranslate.translate.load_installed_languages()
+hi_lang = next(filter(lambda x: x.code == "hi", installed_languages))
+en_lang = next(filter(lambda x: x.code == "en", installed_languages))
+translator = hi_lang.get_translation(en_lang)
+
+# Hindi phrases to search
+phrases = [
+    "किशोर",
+    "किशोर न्यायालय",
+    "बोर्ड",
+    "प्रारंभिक आकलन",
+    "प्रारंभिक निर्धारण",
+    "बालक"
+]
+
+main_phrases = ["किशोर", "किशोर न्यायालय"]
+
+input_dir = "txt"
+output_csv_hindi = "output_hindi.csv"
+output_csv_english = "output_english.csv"
+base_url = "https://aarch.compromyse.xyz:8000/txt/"
+
+# Extract up to 10 snippets for a phrase
+def extract_snippets(text, phrase, window=10, max_count=10):
+    words = text.split()
+    snippets = []
+    for i, word in enumerate(words):
+        if phrase in word:
+            start = max(0, i - window)
+            end = min(len(words), i + window + 1)
+            snippet = ' '.join(words[start:end])
+            snippets.append(snippet)
+            if len(snippets) >= max_count:
+                break
+    return snippets
+
+# CSV header
+header = ["File", "File URL"]
+for phrase in phrases:
+    header.append(f"{phrase} Present")
+    if phrase in main_phrases:
+        for i in range(1, 11):
+            header.append(f"{phrase} Snippet {i}")
+    else:
+        header.append(f"{phrase} Snippet")
+
+# Process files
+results = []
+for filename in os.listdir(input_dir):
+    if filename.endswith(".txt"):
+        filepath = os.path.join(input_dir, filename)
+        with open(filepath, 'r', encoding='utf-8') as f:
+            text = f.read()
+        file_url = base_url + filename
+        row = [filename, file_url]
+
+        for phrase in phrases:
+            found = phrase in text
+            row.append("Yes" if found else "No")
+
+            if found:
+                snippets = extract_snippets(text, phrase, max_count=10)
+                if phrase in main_phrases:
+                    row.extend(snippets + [""] * (10 - len(snippets)))
+                else:
+                    row.append(snippets[0] if snippets else "")
+            else:
+                if phrase in main_phrases:
+                    row.extend([""] * 10)
+                else:
+                    row.append("")
+        results.append(row)
+
+# Write Hindi CSV
+with open(output_csv_hindi, 'w', encoding='utf-8-sig', newline='') as f:
+    writer = csv.writer(f)
+    writer.writerow(header)
+    writer.writerows(results)
+
+# Translate header
+translated_header = [translator.translate(cell) if re.search(r'[\u0900-\u097F]', cell) else cell for cell in header]
+
+# Translate rows
+translated_rows = [translated_header]
+for row in results:
+    translated_row = []
+    for cell in row:
+        try:
+            if re.search(r'[\u0900-\u097F]', cell):  # Only translate if Hindi detected
+                translated_row.append(translator.translate(cell))
+            else:
+                translated_row.append(cell)
+        except:
+            translated_row.append(cell)
+    translated_rows.append(translated_row)
+
+# Write English CSV
+with open(output_csv_english, 'w', encoding='utf-8-sig', newline='') as f:
+    writer = csv.writer(f)
+    writer.writerows(translated_rows)
+
+print(f"✅ Hindi CSV saved to: {output_csv_hindi}")
+print(f"✅ English CSV saved to: {output_csv_english}")
diff --git a/scrape_ecourtindia_v6/transcribe.py b/scrape_ecourtindia_v6/transcribe.py
new file mode 100644
index 0000000..80f5094
--- /dev/null
+++ a/scrape_ecourtindia_v6/transcribe.py
@@ -1,0 +1,102 @@
+import os
+import easyocr
+import shutil
+import csv
+from pdf2image import convert_from_path
+# import pytesseract
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def read_csv_filenames(csv_path):
+    filenames = set()
+    with open(csv_path, newline='', encoding='utf-8') as csvfile:
+        reader = csv.reader(csvfile)
+        for row in reader:
+            if len(row) >= 5:
+                filename = row[4].strip()
+                if filename.lower().endswith('.pdf'):
+                    filenames.add(filename)
+    return filenames
+
+def process_pdf(pdf_path, output_folder, dpi=300, lang='hi'):
+    reader = easyocr.Reader(['hi'], gpu=True)  # 'hi' is for Hindi
+    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+    pdf_output_dir = os.path.join(output_folder, pdf_name)
+    images_dir = os.path.join(pdf_output_dir, "images")
+
+    os.makedirs(images_dir, exist_ok=True)
+
+    try:
+        images = convert_from_path(pdf_path, dpi=dpi)
+        ocr_texts = []
+
+        for i, image in enumerate(images):
+            image_path = os.path.join(images_dir, f"page_{i+1}.png")
+            image.save(image_path, "PNG")
+
+            # GPU-accelerated OCR
+            result = reader.readtext(image_path, detail=0)
+            text = "\n".join(result)
+
+            ocr_texts.append(f"--- Page {i+1} ---\n{text.strip()}\n")
+
+        ocr_output_path = os.path.join(pdf_output_dir, "ocr_output.txt")
+        with open(ocr_output_path, "w", encoding="utf-8") as f:
+            f.write("\n".join(ocr_texts))
+
+        print(f"✅ Processed with GPU: {pdf_path} → {ocr_output_path}")
+    except Exception as e:
+        print(f"❌ Error processing {pdf_path}: {e}")
+
+def collect_txt_files(base_output_folder, destination_folder):
+    os.makedirs(destination_folder, exist_ok=True)
+    for root, dirs, files in os.walk(base_output_folder):
+        for file in files:
+            if file == "ocr_output.txt":
+                full_path = os.path.join(root, file)
+                new_name = os.path.basename(os.path.dirname(full_path)) + ".txt"
+                dest_path = os.path.join(destination_folder, new_name)
+                shutil.copy(full_path, dest_path)
+                print(f"📁 Copied: {full_path} → {dest_path}")
+
+def batch_process_folder(input_folder, output_folder, csv_path, dpi=300, lang='hi', max_threads=32):
+    os.makedirs(output_folder, exist_ok=True)
+
+    # Read allowed filenames from the CSV
+    valid_filenames = read_csv_filenames(csv_path)
+
+    # Only include matching PDF files
+    pdf_files = [
+        os.path.join(input_folder, filename)
+        for filename in os.listdir(input_folder)
+        if filename in valid_filenames
+    ]
+
+    print(f'number_of_files: {len(pdf_files)}')
+
+    if not pdf_files:
+        print("⚠️ No matching PDF files found in input folder.")
+        return
+
+    with ThreadPoolExecutor(max_workers=max_threads) as executor:
+        futures = {
+            executor.submit(process_pdf, pdf_path, output_folder, dpi, lang): pdf_path
+            for pdf_path in pdf_files
+        }
+
+        for future in as_completed(futures):
+            pdf_path = futures[future]
+            try:
+                future.result()
+            except Exception as e:
+                print(f"⚠️ Failed to process {pdf_path}: {e}")
+
+    # collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))
+
+# Set your actual folders and CSV path
+input_folder = "pdf"
+output_folder = "transcribed"
+csv_path = "files.csv"
+
+# Run batch processing with CSV filtering
+# batch_process_folder(input_folder, output_folder, csv_path, lang='hin', max_threads=2)
+collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
deleted file mode 100644
index 485a4b8..0000000
--- a/scrape_ecourtindia_v6/translate_to_english.py
+++ /dev/null
@@ -1,42 +1,0 @@
-from tempfile import TemporaryDirectory
-
-import pytesseract
-from pdf2image import convert_from_path
-from PIL import Image
-
-from tinydb import TinyDB
-
-language = 'hin'
-
-def to_english(input_file, output_file):
-    image_file_list = []
-
-    with TemporaryDirectory() as tempdir:
-        pdf_pages = convert_from_path(input_file, 500)
-
-        for page_enumeration, page in enumerate(pdf_pages, start=1):
-            filename = f"{tempdir}/page_{page_enumeration}.jpg"
-            page.save(filename, "JPEG")
-            image_file_list.append(filename)
-
-        with open(output_file, "a") as h:
-            for image_file in image_file_list:
-                text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
-
-                # In many PDFs, at line ending, if a word can't
-                # be written fully, a 'hyphen' is added.
-                # The rest of the word is written in the next line
-                # Eg: This is a sample text this word here GeeksF-
-                # orGeeks is half on first line, remaining on next.
-                # To remove this, we replace every '-\n' to ''.
-                text = text.replace("-\n", "")
-
-                breakpoint()
-
-                h.write(text)
-
-db = TinyDB('orders.json')
-entries = db.all()
-
-for entry in entries:
-    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
diff --git a/scrape_ecourtindia_v6/modules/scraper_orders.py b/scrape_ecourtindia_v6/modules/scraper_orders.py
index d0b8df3..0a54a91 100644
--- a/scrape_ecourtindia_v6/modules/scraper_orders.py
+++ a/scrape_ecourtindia_v6/modules/scraper_orders.py
@@ -71,7 +71,7 @@
             self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ])
             i += 5
 
-    def handle_orders(self, court_name):
+    def handle_orders(self, court_name, district):
         for row in self.rows:
             order = row[0]
 
@@ -97,7 +97,8 @@
             except:
                 print(f'UNABLE TO FETCH PDF: {pdf_url}')
 
-            record = { 'court_name': court_name, 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
+            record = { 'district': district, 'court_name': court_name, 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
             self.db.insert(record)
 
+            sleep(0.7)
             self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
-- 
rgit 0.1.5