github.com/compromyse/enfold.git

author Raghuram Subramani <raghus2247@gmail.com> 2025-04-22 20:54:47.0 +05:30:00
committer Raghuram Subramani <raghus2247@gmail.com> 2025-04-22 20:54:47.0 +05:30:00
commit 3ed36b1adb0be6a450afb755e192a7198187e052 [patch]
tree   635a10c9540bb5e93a4433d53a4279c68802157c
parent c5d8880d6419e48b5c1450a5c1236576a47d2ac8

update a few scripts



Diff

 flake.nix                                       |   7 ++++++-
 rev/package-lock.json                           |   9 ++++-----
 rev/package.json                                |   2 +-
 rev/rev.js                                      |  37 +++++++++++++++----------------------
 scrape_ecourtindia_v6/.gitignore                |   2 ++
 scrape_ecourtindia_v6/create_csv.py             |   4 ++--
 scrape_ecourtindia_v6/create_named_pdfs.py      |   3 ++-
 scrape_ecourtindia_v6/scrape_orders.py          |   9 ++++-----
 scrape_ecourtindia_v6/search_for_words.py       | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/transcribe.py             | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrape_ecourtindia_v6/translate_to_english.py   |  42 ------------------------------------------
 scrape_ecourtindia_v6/modules/scraper_orders.py |   5 +++--
 12 files changed, 249 insertions(+), 82 deletions(-)

diff --git a/flake.nix b/flake.nix
index 838035c..8e0575f 100644
--- a/flake.nix
+++ b/flake.nix
@@ -11,6 +11,7 @@
          p.selenium
          p.opencv-python
          p.pytesseract
          p.easyocr
          p.beautifulsoup4
          p.tinydb
          p.fastapi
@@ -19,10 +20,14 @@
          p.streamlit
          p.gradio

          # p.pdf2image
          p.pdf2image
          p.argostranslate
          # p.openai-whisper
          # p.torch-bin
        ]))

        python3Packages.pymupdf
        mupdf

        pyright

diff --git a/rev/package-lock.json b/rev/package-lock.json
index 7351f0a..5055186 100644
--- a/rev/package-lock.json
+++ b/rev/package-lock.json
@@ -5,15 +5,14 @@
  "packages": {
    "": {
      "dependencies": {
        "crypto-js": "^4.2.0",
        "crypto-js": "3.1.2",
        "node-fetch": "^3.3.2"
      }

    },

    "node_modules/crypto-js": {
      "version": "4.2.0",
      "resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-4.2.0.tgz",
      "integrity": "sha512-KALDyEYgpY+Rlob/iriUtjV6d5Eq+Y191A5g4UqLAi8CyGP9N1+FdVbkc1SxKc2r4YAYqG8JzO2KGL+AizD70Q==",
      "license": "MIT"
      "version": "3.1.2",
      "resolved": "https://registry.npmjs.org/crypto-js/-/crypto-js-3.1.2.tgz",
      "integrity": "sha512-egolhMvFgIelOG34Goj51J6MUEMr2X8mSam6+54pXiPxcOAfRU68CgsELFsEI5hXZLk0hvUwc7y1VuHi+5RMIQ=="
    },

    "node_modules/data-uri-to-buffer": {
      "version": "4.0.1",
diff --git a/rev/package.json b/rev/package.json
index 0446880..80c9499 100644
--- a/rev/package.json
+++ b/rev/package.json
@@ -1,6 +1,6 @@
{
  "dependencies": {
    "crypto-js": "^4.2.0",
    "crypto-js": "3.1.2",
    "node-fetch": "^3.3.2"
  }

}

diff --git a/rev/rev.js b/rev/rev.js
index 2bfd9ef..d8c0ed8 100644
--- a/rev/rev.js
+++ b/rev/rev.js
@@ -9,17 +9,6 @@
let globaliv = "4B6250655368566D";
let globalIndex = 0;

// Utility: Check internet connection
async function checkDeviceOnlineStatus() {
  try {
    await dns.lookup('google.com');
    return true;
  } catch {
    console.error("Please check your internet connection and try again");
    return false;
  }
}

// Show error message (replace alert with console)
function showErrorMessage(message) {
  console.error("Error:", message);
@@ -74,19 +63,21 @@

// API call wrapper
async function callToWebService(url, data, callback) {
  const online = await checkDeviceOnlineStatus();
  if (!online) return;

  try {
    const encryptedData = encryptData(data);
    const headers = {
      'Content-Type': 'application/json',
      'user-agent': 'eCourtsServices/2.0.1 (iPhone; iOS 18.4; Scale/3.00)'
    };

    headers['Authorization'] = 'Bearer ' + encryptData(jwttoken);

    // const params = new URLSearchParams({ action_code: encryptedData });
    // const fullUrl = `${url}?${params.toString()}`;
    const fullUrl = url;

    const params = new URLSearchParams({ data: encryptedData });
    const fullUrl = `${url}?${params.toString()}`;
    console.log(data);
    console.log(fullUrl);

    const res = await fetch(fullUrl, {
      method: 'GET',
@@ -94,6 +85,8 @@
    });

    const responseText = await res.text();

    console.log(`responseText:\n${responseText}\n`)
    const decodedResponse = JSON.parse(decodeResponse(responseText));

    if (decodedResponse.token) {
@@ -108,7 +101,7 @@
          const packageName = "com.eCourts.mobile";
          const uidObj = { uid: "324456:" + packageName };
          const newData = { ...data, ...uidObj };
          return callToWebService(url, newData, callback);
          return await callToWebService(url, newData, callback);
        } else {
          showErrorMessage("Session expired!");
        }
@@ -133,15 +126,13 @@

// Fetch Court Complexes
async function getCourtComplexes(state_code, dist_code, callback) {
  const url = hostIP + "courtEstWebService.php";
  const data = {
    action_code: "fillCourtComplex",
    state_code,
    dist_code
  };
  const url = hostIP + "appReleaseWebService.php";
  let data = 'fillState';
  await callToWebService(url, data, callback);
}

getCourtComplexes("1", "101", (res) => {
  console.log("Court Complexes:", res.courtComplex);
});

console.log(decodeResponse('POaJ42M9nP6pkEJim6CFmQ=='));
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index 1aed0d4..36f0da5 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -6,3 +6,5 @@
bak/
translated/*
*.json
transcribed/*
txt/*
diff --git a/scrape_ecourtindia_v6/create_csv.py b/scrape_ecourtindia_v6/create_csv.py
index 5561b73..1bf8860 100644
--- a/scrape_ecourtindia_v6/create_csv.py
+++ b/scrape_ecourtindia_v6/create_csv.py
@@ -6,10 +6,10 @@

csvfile = open('orders.csv', 'w', newline='')
w = csv.writer(csvfile)
w.writerow(['Court Name', 'Case Info', 'Petitioner/Respondent', 'Date', 'File'])
w.writerow(['District', 'Court Name', 'Case Info', 'Petitioner/Respondent', 'Date', 'File'])

for entry in entries:
    ent = [entry['court_name'], entry['case_info'], entry['petitioner_respondent'], entry['date'], f'http://aarch.compromyse.xyz:8000/{entry["filename"]}']
    ent = [entry['district'], entry['court_name'], entry['case_info'], entry['petitioner_respondent'], entry['date'], f'http://aarch.compromyse.xyz:8000/{entry["filename"]}']
    w.writerow(ent)

csvfile.close()
diff --git a/scrape_ecourtindia_v6/create_named_pdfs.py b/scrape_ecourtindia_v6/create_named_pdfs.py
index c47c66e..a37fc10 100644
--- a/scrape_ecourtindia_v6/create_named_pdfs.py
+++ b/scrape_ecourtindia_v6/create_named_pdfs.py
@@ -13,11 +13,12 @@
entries = db.all()

for entry in entries:
    district = sanitize_filename(entry['district'])
    date = sanitize_filename(entry['date'])
    case_info = sanitize_filename(entry['case_info'])
    court_name = sanitize_filename(entry['court_name'])
    
    newname = f"named_pdf/{date}---{case_info}---{court_name}.pdf"
    newname = f"named_pdf/{district}---{date}---{case_info}---{court_name}.pdf"
    
    try:
        shutil.copyfile(entry['filename'], newname)
diff --git a/scrape_ecourtindia_v6/scrape_orders.py b/scrape_ecourtindia_v6/scrape_orders.py
index 146119e..e254967 100644
--- a/scrape_ecourtindia_v6/scrape_orders.py
+++ b/scrape_ecourtindia_v6/scrape_orders.py
@@ -51,7 +51,7 @@
        
        scraper.submit_search()
        scraper.parse_orders_table()
        scraper.handle_orders(row[3])
        scraper.handle_orders(row[3], row[1])
        
        scraper.driver.quit()
    
@@ -63,7 +63,7 @@
        reader = csv.reader(csvfile)
        courts = list(reader)
    
    with ThreadPoolExecutor(max_workers=5) as executor:
    with ThreadPoolExecutor(max_workers=1) as executor:
        futures = [
            executor.submit(scrape_single_court, court) 
            for court in courts
@@ -75,6 +75,5 @@
            except Exception as e:
                print(f"A thread encountered an error: {e}")
    
if __name__ == '__main__':
    input_file = 'csv/2023-24_pocso.csv'
    scrape_orders(input_file)
input_file = 'csv/2023-24_pocso_all_districts.csv'
scrape_orders(input_file)
diff --git a/scrape_ecourtindia_v6/search_for_words.py b/scrape_ecourtindia_v6/search_for_words.py
new file mode 100644
index 0000000..effcea9 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/search_for_words.py
@@ -0,0 +1,109 @@
import os
import csv
import re
import argostranslate.translate

# Load Argos Translate model (assumes it's already installed)
installed_languages = argostranslate.translate.load_installed_languages()
hi_lang = next(filter(lambda x: x.code == "hi", installed_languages))
en_lang = next(filter(lambda x: x.code == "en", installed_languages))
translator = hi_lang.get_translation(en_lang)

# Hindi phrases to search
phrases = [
    "किशोर",
    "किशोर न्यायालय",
    "बोर्ड",
    "प्रारंभिक आकलन",
    "प्रारंभिक निर्धारण",
    "बालक"
]

main_phrases = ["किशोर", "किशोर न्यायालय"]

input_dir = "txt"
output_csv_hindi = "output_hindi.csv"
output_csv_english = "output_english.csv"
base_url = "https://aarch.compromyse.xyz:8000/txt/"

# Extract up to 10 snippets for a phrase
def extract_snippets(text, phrase, window=10, max_count=10):
    words = text.split()
    snippets = []
    for i, word in enumerate(words):
        if phrase in word:
            start = max(0, i - window)
            end = min(len(words), i + window + 1)
            snippet = ' '.join(words[start:end])
            snippets.append(snippet)
            if len(snippets) >= max_count:
                break
    return snippets

# CSV header
header = ["File", "File URL"]
for phrase in phrases:
    header.append(f"{phrase} Present")
    if phrase in main_phrases:
        for i in range(1, 11):
            header.append(f"{phrase} Snippet {i}")
    else:
        header.append(f"{phrase} Snippet")

# Process files
results = []
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(input_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
            file_url = base_url + filename
            row = [filename, file_url]

            for phrase in phrases:
                found = phrase in text
                row.append("Yes" if found else "No")

                if found:
                    snippets = extract_snippets(text, phrase, max_count=10)
                    if phrase in main_phrases:
                        row.extend(snippets + [""] * (10 - len(snippets)))
                    else:
                        row.append(snippets[0] if snippets else "")
                else:
                    if phrase in main_phrases:
                        row.extend([""] * 10)
                    else:
                        row.append("")
            results.append(row)

# Write Hindi CSV
with open(output_csv_hindi, 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(results)

# Translate header
translated_header = [translator.translate(cell) if re.search(r'[\u0900-\u097F]', cell) else cell for cell in header]

# Translate rows
translated_rows = [translated_header]
for row in results:
    translated_row = []
    for cell in row:
        try:
            if re.search(r'[\u0900-\u097F]', cell):  # Only translate if Hindi detected
                translated_row.append(translator.translate(cell))
            else:
                translated_row.append(cell)
        except:
            translated_row.append(cell)
    translated_rows.append(translated_row)

# Write English CSV
with open(output_csv_english, 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(translated_rows)

print(f"✅ Hindi CSV saved to: {output_csv_hindi}")
print(f"✅ English CSV saved to: {output_csv_english}")
diff --git a/scrape_ecourtindia_v6/transcribe.py b/scrape_ecourtindia_v6/transcribe.py
new file mode 100644
index 0000000..80f5094 100644
--- /dev/null
+++ b/scrape_ecourtindia_v6/transcribe.py
@@ -0,0 +1,102 @@
import os
import easyocr
import shutil
import csv
from pdf2image import convert_from_path
# import pytesseract
from concurrent.futures import ThreadPoolExecutor, as_completed

def read_csv_filenames(csv_path):
    filenames = set()
    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if len(row) >= 5:
                filename = row[4].strip()
                if filename.lower().endswith('.pdf'):
                    filenames.add(filename)
    return filenames

def process_pdf(pdf_path, output_folder, dpi=300, lang='hi'):
    reader = easyocr.Reader(['hi'], gpu=True)  # 'hi' is for Hindi
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_dir = os.path.join(output_folder, pdf_name)
    images_dir = os.path.join(pdf_output_dir, "images")

    os.makedirs(images_dir, exist_ok=True)

    try:
        images = convert_from_path(pdf_path, dpi=dpi)
        ocr_texts = []

        for i, image in enumerate(images):
            image_path = os.path.join(images_dir, f"page_{i+1}.png")
            image.save(image_path, "PNG")

            # GPU-accelerated OCR
            result = reader.readtext(image_path, detail=0)
            text = "\n".join(result)

            ocr_texts.append(f"--- Page {i+1} ---\n{text.strip()}\n")

        ocr_output_path = os.path.join(pdf_output_dir, "ocr_output.txt")
        with open(ocr_output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(ocr_texts))

        print(f"✅ Processed with GPU: {pdf_path}{ocr_output_path}")
    except Exception as e:
        print(f"❌ Error processing {pdf_path}: {e}")

def collect_txt_files(base_output_folder, destination_folder):
    os.makedirs(destination_folder, exist_ok=True)
    for root, dirs, files in os.walk(base_output_folder):
        for file in files:
            if file == "ocr_output.txt":
                full_path = os.path.join(root, file)
                new_name = os.path.basename(os.path.dirname(full_path)) + ".txt"
                dest_path = os.path.join(destination_folder, new_name)
                shutil.copy(full_path, dest_path)
                print(f"📁 Copied: {full_path}{dest_path}")

def batch_process_folder(input_folder, output_folder, csv_path, dpi=300, lang='hi', max_threads=32):
    os.makedirs(output_folder, exist_ok=True)

    # Read allowed filenames from the CSV
    valid_filenames = read_csv_filenames(csv_path)

    # Only include matching PDF files
    pdf_files = [
        os.path.join(input_folder, filename)
        for filename in os.listdir(input_folder)
        if filename in valid_filenames
    ]

    print(f'number_of_files: {len(pdf_files)}')

    if not pdf_files:
        print("⚠️ No matching PDF files found in input folder.")
        return

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {
            executor.submit(process_pdf, pdf_path, output_folder, dpi, lang): pdf_path
            for pdf_path in pdf_files
        }

        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                future.result()
            except Exception as e:
                print(f"⚠️ Failed to process {pdf_path}: {e}")

    # collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))

# Set your actual folders and CSV path
input_folder = "pdf"
output_folder = "transcribed"
csv_path = "files.csv"

# Run batch processing with CSV filtering
# batch_process_folder(input_folder, output_folder, csv_path, lang='hin', max_threads=2)
collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
deleted file mode 100644
index 485a4b8..0000000 100644
--- a/scrape_ecourtindia_v6/translate_to_english.py
+++ /dev/null
@@ -1,42 +0,0 @@
from tempfile import TemporaryDirectory
 
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

from tinydb import TinyDB
 
language = 'hin'
 
def to_english(input_file, output_file):
    image_file_list = []

    with TemporaryDirectory() as tempdir:
        pdf_pages = convert_from_path(input_file, 500)

        for page_enumeration, page in enumerate(pdf_pages, start=1):
            filename = f"{tempdir}/page_{page_enumeration}.jpg"
            page.save(filename, "JPEG")
            image_file_list.append(filename)
 
        with open(output_file, "a") as h:
            for image_file in image_file_list:
                text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
 
                # In many PDFs, at line ending, if a word can't
                # be written fully, a 'hyphen' is added.
                # The rest of the word is written in the next line
                # Eg: This is a sample text this word here GeeksF-
                # orGeeks is half on first line, remaining on next.
                # To remove this, we replace every '-\n' to ''.
                text = text.replace("-\n", "")

                breakpoint()
 
                h.write(text)

db = TinyDB('orders.json')
entries = db.all()

for entry in entries:
    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
diff --git a/scrape_ecourtindia_v6/modules/scraper_orders.py b/scrape_ecourtindia_v6/modules/scraper_orders.py
index d0b8df3..0a54a91 100644
--- a/scrape_ecourtindia_v6/modules/scraper_orders.py
+++ b/scrape_ecourtindia_v6/modules/scraper_orders.py
@@ -71,7 +71,7 @@
            self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ])
            i += 5

    def handle_orders(self, court_name):
    def handle_orders(self, court_name, district):
        for row in self.rows:
            order = row[0]

@@ -97,7 +97,8 @@
            except:
                print(f'UNABLE TO FETCH PDF: {pdf_url}')

            record = { 'court_name': court_name, 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
            record = { 'district': district, 'court_name': court_name, 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
            self.db.insert(record)

            sleep(0.7)
            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()