From 250d8c18777c7e0ba87ec4334d3b316d4e6bd0dd Mon Sep 17 00:00:00 2001
From: Raghuram Subramani <raghus2247@gmail.com>
Date: Mon, 12 May 2025 11:00:36 +0530
Subject: [PATCH] update

---
 web/app/.gitignore           |   1 +
 web/app/job_manager.py       |   5 +----
 web/app/main.py              |  14 +++++++-------
 web/app/jobs/scrape_cases.py | 237 ++++++++++++++++++++++++++++++++++++++++++--------------------------------------
 web/app/templates/home.html  |   7 +++++++
 5 files changed, 125 insertions(+), 139 deletions(-)

diff --git a/web/app/.gitignore b/web/app/.gitignore
new file mode 100644
index 0000000..17aa483
--- /dev/null
+++ b/web/app/.gitignore
@@ -0,0 +1 @@
+outputs/
diff --git a/web/app/job_manager.py b/web/app/job_manager.py
index bdbe994..3d588a6 100644
--- a/web/app/job_manager.py
+++ b/web/app/job_manager.py
@@ -25,7 +25,4 @@
         started_job_ids = self.q.started_job_registry.get_job_ids()
         started_jobs = [Job.fetch(job_id, connection=self.redis) for job_id in started_job_ids]
 
-        finished_job_ids = self.q.finished_job_registry.get_job_ids()
-        finished_jobs = [Job.fetch(job_id, connection=self.redis) for job_id in finished_job_ids]
-
-        return queued_jobs + started_jobs + finished_jobs
+        return queued_jobs + started_jobs
diff --git a/web/app/main.py b/web/app/main.py
index cc141b8..e834c0c 100644
--- a/web/app/main.py
+++ b/web/app/main.py
@@ -1,6 +1,7 @@
 from flask import request, flash, send_from_directory
 from flask import Blueprint, render_template, redirect, url_for
 from flask_login import login_required, logout_user, current_user
+from tinydb import TinyDB
 
 from .models import User
 import json
@@ -9,8 +10,6 @@
 
 from .modules.interface import Interface
 from .job_manager import JobManager
-
-from tinydb import TinyDB
 
 states = Interface().get_states()
 act_list = json.loads(open('app/acts.json').read())
@@ -22,7 +21,8 @@
 @login_required
 def home():
     jobs = job_manager.get_jobs()
-    return render_template('home.html', user=current_user, states=states, acts=act_list, jobs=jobs)
+    completed_jobs = TinyDB('app/jobs.json').all()
+    return render_template('home.html', user=current_user, states=states, acts=act_list, completed_jobs=completed_jobs, jobs=jobs)
 
 @main.route('/logout')
 @login_required
@@ -58,14 +58,14 @@
 @login_required
 def enqueue_job():
     acts = request.form.getlist('act')
-    sections = request.form.get('section').split(',')
+    sections = request.form.get('section', '').split(',')
     state_code = request.form.get('state_code')
     name = request.form.get('name')
 
-    if not section:
-        section = ''
+    if not sections:
+        sections = ['']
 
-    job = job_manager.enqueue_scrape(f'{name} - {time.time_ns()}', acts, sections, state_code)
+    job_manager.enqueue_scrape(f'{name} - {time.time_ns()}', acts, sections, state_code)
 
     flash('Job created.', 'info')
     return redirect(url_for('main.home'))
diff --git a/web/app/jobs/scrape_cases.py b/web/app/jobs/scrape_cases.py
index 7a944f1..237acbc 100644
--- a/web/app/jobs/scrape_cases.py
+++ b/web/app/jobs/scrape_cases.py
@@ -1,160 +1,141 @@
 from app.modules.interface import Interface
-from tinydb import TinyDB
 from bs4 import BeautifulSoup
-import time
 import csv
 
-def scrape_cases(name, acts, sections, state_code):
-    acts = set(acts)
-    db = TinyDB(f'app/outputs/{name}.json')
-    interface = Interface()
+from tinydb import TinyDB
+
+db = TinyDB('app/jobs.json')
 
+def get_districts(interface, state_code):
     try:
-        districts = interface.get_districts(state_code)
+        return interface.get_districts(state_code)
     except Exception as e:
         print(f"[ERROR] Failed to scrape districts: {e}")
-        districts = []
+        return []
+
+def get_complexes(interface, state_code, dist_code, dist_name):
+    try:
+        return interface.get_complexes(state_code, dist_code)
+    except Exception as e:
+        print(f"[ERROR] Failed to scrape complexes for {dist_name}: {e}")
+        return []
+
+def fetch_cases(interface, state_code, dist_code, court_establishment, act, section, complex_name):
+    try:
+        return interface.search_by_act(state_code, dist_code, court_establishment, act, section)
+    except Exception as e:
+        print(f"[ERROR] Failed to scrape cases in complex {complex_name}: {e}")
+        return []
+
+def fetch_case_history(interface, state_code, dist_code, court_establishment, case_no):
+    try:
+        return interface.case_history(state_code, dist_code, court_establishment, case_no)
+    except Exception as e:
+        print(f"[ERROR] Failed to get history for case {case_no}: {e}")
+        return None
+
+def parse_orders(order_html):
+    soup = BeautifulSoup(order_html or '', features="html.parser")
+    orders = []
+    for row in soup.select('table.tbl-result tbody tr'):
+        cells = row.find_all('td')
+        if len(cells) >= 2:
+            order_date = cells[1].get_text(strip=True)
+            link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
+            if link_tag:
+                orders.append({'date': order_date, 'link': link_tag['href']})
+    return orders
+
+def parse_acts(entry, all_acts):
+    soup = BeautifulSoup(entry.get('act', ''), 'html.parser')
+    acts = []
+    for row in soup.select('tbody tr'):
+        cells = row.find_all('td')
+        if len(cells) == 2:
+            act = cells[0].get_text(strip=True)
+            section = cells[1].get_text(strip=True)
+            if act not in all_acts:
+                all_acts.append(act)
+            acts.append(f"{act}: {section}")
+    return '\n'.join(acts)
+
+def write_to_csv(entries, key_mapping, name):
+    max_final = max((len(entry.get('final_orders', [])) for entry in entries), default=0)
+    max_interim = max((len(entry.get('interim_orders', [])) for entry in entries), default=0)
+
+    with open(f'app/outputs/{name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        headers = list(key_mapping.values()) + \
+            [f'Final Order {i+1}' for i in range(max_final)] + \
+            [f'Interim Order {i+1}' for i in range(max_interim)]
+        writer.writerow(headers)
+
+        for entry in entries:
+            row = [entry.get(key, '') for key in key_mapping]
+
+            for order in entry.get('final_orders', []):
+                row.append(f'=HYPERLINK("{order["link"]}", "{order["date"]}")')
+            row += [''] * (max_final - len(entry.get('final_orders', [])))
+
+            for order in entry.get('interim_orders', []):
+                row.append(f'=HYPERLINK("{order["link"]}", "{order["date"]}")')
+            row += [''] * (max_interim - len(entry.get('interim_orders', [])))
+            writer.writerow(row)
+
+def scrape_cases(name, acts, sections, state_code):
+    acts = set(acts)
+    entries = []
+    interface = Interface()
+
+    districts = get_districts(interface, state_code)
     for dist_code, dist_name in districts:
         print(f'DISTRICT: {dist_name}')
+        complexes = get_complexes(interface, state_code, dist_code, dist_name)
 
-        try:
-            complexes = interface.get_complexes(state_code, dist_code)
-        except Exception as e:
-            print(f"[ERROR] Failed to scrape complexes for {dist_name}: {e}")
-            continue
-
         for complex_code, complex_name in complexes:
             print(f'COMPLEX: {complex_name}')
-
             court_establishments = str(complex_code).split(',')
+
             for i, court_establishment in enumerate(court_establishments, 1):
                 print(f'ESTABLISHMENT: {i}/{len(court_establishments)}')
 
                 for act in acts:
                     for section in sections:
-                        try:
-                            cases = interface.search_by_act(state_code, dist_code, court_establishment, act, section)
-                        except Exception as e:
-                            print(f"[ERROR] Failed to scrape cases in complex {complex_name}: {e}")
-                            continue
+                        cases = fetch_cases(interface, state_code, dist_code, court_establishment, act, section, complex_name)
 
                         for j, case in enumerate(cases, 1):
                             print(f'CASE: {j}/{len(cases)}')
-
-                            try:
-                                case_no = case['case_no']
-                                case_history = interface.case_history(state_code, dist_code, court_establishment, case_no)
-                            except Exception as e:
-                                print(f"[ERROR] Failed to get history for case {case.get('case_no', 'UNKNOWN')}: {e}")
+                            case_no = case.get('case_no')
+                            if not case_no:
                                 continue
 
-                            try:
-                                case_history['case_no'] = case_no
-                                case_history['complex_name'] = complex_name
-                                db.insert(case_history)
+                            case_history = fetch_case_history(interface, state_code, dist_code, court_establishment, case_no)
+                            if not case_history:
+                                continue
 
-                            except Exception as e:
-                                print(f"[ERROR] Failed to parse orders for case {case_no}: {e}")
-
-    entries = db.all()
+                            case_history['case_no'] = case_no
+                            case_history['complex_name'] = complex_name
+                            entries.append(case_history)
 
     key_mapping = {
-        'case_no': 'Case Number',
-        'cino': 'CNR Number',
-        'type_name': 'Case Type',
-
-        'reg_no': 'Registration Number',
-        'reg_year': 'Registration Year',
-
-        'district_name': 'District',
-        'complex_name': 'Complex Name',
-        'court_name': 'Court Name',
-
-        'dt_regis': 'Registration Date',
-        'date_of_filing': 'Date of Filing',
-        'date_of_decision': 'Date of Decision',
-        'disp_name': 'Disposition',
-
-        'acts': 'Acts',
-
-        'pet_name': 'Petitioner',
-        'pet_adv': 'Petitioner Advocate',
-        'petparty_name': 'Petitioner Party Name',
-
-        'res_name': 'Respondent',
-        'res_adv': 'Respondent Advocate',
-        'resparty_name': 'Respondent Party Name'
+        'case_no': 'Case Number', 'cino': 'CNR Number', 'type_name': 'Case Type',
+        'reg_no': 'Registration Number', 'reg_year': 'Registration Year',
+        'district_name': 'District', 'complex_name': 'Complex Name', 'court_name': 'Court Name',
+        'dt_regis': 'Registration Date', 'date_of_filing': 'Date of Filing', 'date_of_decision': 'Date of Decision',
+        'disp_name': 'Disposition', 'acts': 'Acts',
+        'pet_name': 'Petitioner', 'pet_adv': 'Petitioner Advocate', 'petparty_name': 'Petitioner Party Name',
+        'res_name': 'Respondent', 'res_adv': 'Respondent Advocate', 'resparty_name': 'Respondent Party Name'
     }
 
     all_acts = []
 
     for entry in entries:
-        soup = BeautifulSoup(entry.get('finalOrder') or '', features="html.parser")
-        final_orders = []
-        for row in soup.select('table.tbl-result tbody tr'):
-            cells = row.find_all('td')
-            if len(cells) >= 2:
-                order_date = cells[1].get_text(strip=True)
-                link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
-                if link_tag:
-                    final_orders.append({'date': order_date, 'link': link_tag['href']})
-
-        soup = BeautifulSoup(entry.get('interimOrder') or '', features="html.parser")
-        interim_orders = []
-        for row in soup.select('table.tbl-result tbody tr'):
-            cells = row.find_all('td')
-            if len(cells) >= 2:
-                order_date = cells[1].get_text(strip=True)
-                link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
-                if link_tag:
-                    interim_orders.append({'date': order_date, 'link': link_tag['href']})
-
-        act_html = entry.get('act', '')
-        soup = BeautifulSoup(act_html, 'html.parser')
-
-        acts = []
-        for row in soup.select('tbody tr'):
-            cells = row.find_all('td')
-            if len(cells) == 2:
-                act = cells[0].get_text(strip=True)
-                section = cells[1].get_text(strip=True)
-                if act not in all_acts:
-                    all_acts.append(act)
-
-                acts.append(f"{act}: {section}")
-
-        entry['acts'] = '\n'.join(acts)
-        entry['final_orders'] = final_orders
-        entry['interim_orders'] = interim_orders
-
-    max_final = max(len(entry.get('final_orders', [])) for entry in entries)
-    max_interim = max(len(entry.get('interim_orders', [])) for entry in entries)
-
-    with open(f'app/outputs/{name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
-        writer = csv.writer(csvfile)
-
-        headers = list(key_mapping.values())
-
-        headers += [f'Final Order {i+1}' for i in range(max_final)]
-        headers += [f'Interim Order {i+1}' for i in range(max_interim)]
-        writer.writerow(headers)
-
-        for entry in entries:
-            row = []
-            for key in key_mapping:
-                row.append(entry.get(key, ''))
-
-            final_orders = entry.get('final_orders', [])
-            for order in final_orders:
-                hyperlink = f'=HYPERLINK("{order["link"]}", "{order["date"]}")'
-                row.append(hyperlink)
-            row += [''] * (max_final - len(final_orders))
-
-            interim_orders = entry.get('interim_orders', [])
-            for order in interim_orders:
-                hyperlink = f'=HYPERLINK("{order["link"]}", "{order["date"]}")'
-                row.append(hyperlink)
-            row += [''] * (max_interim - len(interim_orders))
-
-            writer.writerow(row)
+        entry['final_orders'] = parse_orders(entry.get('finalOrder'))
+        entry['interim_orders'] = parse_orders(entry.get('interimOrder'))
+        entry['acts'] = parse_acts(entry, all_acts)
+
+    write_to_csv(entries, key_mapping, name)
+
+    db.insert({
+        "name": name
+    })
diff --git a/web/app/templates/home.html b/web/app/templates/home.html
index 797c66d..7caff64 100644
--- a/web/app/templates/home.html
+++ b/web/app/templates/home.html
@@ -62,6 +62,13 @@
         </tr>
       </thead>
       <tbody>
+        {% for job in completed_jobs %}
+        <tr>
+          <td>{{ job['name'] }}</td>
+          <td>COMPLETED</td>
+          <td><a href="{{ url_for('main.download_output', filename=job['name']) }}">Download</a></td>
+        </tr>
+        {% endfor %}
+        {% for job in jobs %}
         <tr>
           <td>{{ job.args[0] }}</td>
-- 
rgit 0.1.5
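
Reviewer note (text below the signature separator is not part of the patch and
will not be applied): a minimal standalone sketch of how the new parse_orders
helper behaves. The order-table HTML below is a hypothetical sample, assuming
the eCourts markup the selectors imply: a table with class "tbl-result", the
order date in the second cell, and the order link in the third. Requires
beautifulsoup4.

    from bs4 import BeautifulSoup

    def parse_orders(order_html):
        # Copied from the patch: tolerate None/empty HTML and keep only
        # rows that actually carry a link.
        soup = BeautifulSoup(order_html or '', features="html.parser")
        orders = []
        for row in soup.select('table.tbl-result tbody tr'):
            cells = row.find_all('td')
            if len(cells) >= 2:
                order_date = cells[1].get_text(strip=True)
                link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
                if link_tag:
                    orders.append({'date': order_date, 'link': link_tag['href']})
        return orders

    # Hypothetical sample: the first row has an order link, the second
    # does not and is therefore skipped.
    sample = """
    <table class="tbl-result"><tbody>
      <tr><td>1</td><td>01-01-2025</td><td><a href="https://example.com/o1.pdf">View</a></td></tr>
      <tr><td>2</td><td>15-02-2025</td><td></td></tr>
    </tbody></table>
    """
    print(parse_orders(sample))
    # [{'date': '01-01-2025', 'link': 'https://example.com/o1.pdf'}]

These dicts are what write_to_csv turns into =HYPERLINK("link", "date") cells,
which spreadsheet applications such as Excel or LibreOffice render as
clickable order links when the CSV is opened.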