From 959c5e6eaf0b5879c8277fe58685d28ec94f98d0 Mon Sep 17 00:00:00 2001 From: Raghuram Subramani <raghus2247@gmail.com> Date: Thu, 27 Mar 2025 23:20:24 +0530 Subject: [PATCH] clean --- scrape_ecourtindia_v6/.gitignore | 1 + scrape_ecourtindia_v6/clean.sh | 2 +- scrape_ecourtindia_v6/requirements.txt | 3 --- scrape_ecourtindia_v6/run.sh | 3 --- scrape_ecourtindia_v6/scrape_orders.py | 4 ++-- scrape_ecourtindia_v6/web.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- scrape_ecourtindia_v6/templates/index.html | 40 ---------------------------------------- 7 files changed, 71 insertions(+), 64 deletions(-) diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore index 62236f3..f390c7e 100644 --- a/scrape_ecourtindia_v6/.gitignore +++ a/scrape_ecourtindia_v6/.gitignore @@ -1,2 +1,3 @@ courts.csv csv/* +orders.json diff --git a/scrape_ecourtindia_v6/clean.sh b/scrape_ecourtindia_v6/clean.sh index 8c8a0ab..a38f202 100755 --- a/scrape_ecourtindia_v6/clean.sh +++ a/scrape_ecourtindia_v6/clean.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash -rm -r html/* pdf/* db.json +rm -rf html/* pdf/* *.json mkdir -p html pdf diff --git a/scrape_ecourtindia_v6/requirements.txt b/scrape_ecourtindia_v6/requirements.txt deleted file mode 100644 index 78bea83..0000000 100644 --- a/scrape_ecourtindia_v6/requirements.txt +++ /dev/null @@ -1,3 +1,0 @@ -selenium -opencv-python -pytesseract diff --git a/scrape_ecourtindia_v6/run.sh b/scrape_ecourtindia_v6/run.sh deleted file mode 100644 index de47eaf..0000000 100644 --- a/scrape_ecourtindia_v6/run.sh +++ /dev/null @@ -1,3 +1,0 @@ -#!/usr/bin/env bash - -uvicorn web:app --reload diff --git a/scrape_ecourtindia_v6/scrape_orders.py b/scrape_ecourtindia_v6/scrape_orders.py index b0dc311..146119e 100644 --- a/scrape_ecourtindia_v6/scrape_orders.py +++ a/scrape_ecourtindia_v6/scrape_orders.py @@ -43,8 +43,9 @@ scraper.select('court_complex_code', row[2]) sleep(1) scraper.goto_courtnumber() - sleep(0.6) + sleep(1) scraper.select('nnjudgecode1', row[3]) + sleep(1) scraper.driver.find_element(By.ID, 'radBoth2').click() @@ -60,7 +61,6 @@ def scrape_orders(courts_csv): with open(courts_csv, newline='') as csvfile: reader = csv.reader(csvfile) - next(reader, None) courts = list(reader) with ThreadPoolExecutor(max_workers=5) as executor: diff --git a/scrape_ecourtindia_v6/web.py b/scrape_ecourtindia_v6/web.py index 195b81f..a0bf0b0 100644 --- a/scrape_ecourtindia_v6/web.py +++ a/scrape_ecourtindia_v6/web.py @@ -1,20 +1,72 @@ -from tinydb import TinyDB +import os +from flask import Flask, send_from_directory, abort -from fastapi import FastAPI, Request -from fastapi.responses import HTMLResponse -from fastapi.staticfiles import StaticFiles -from fastapi.templating import Jinja2Templates +app = Flask(__name__) -db = TinyDB('db.json') -app = FastAPI() +# Directory where PDFs are stored +PDF_DIRECTORY = './pdf' -app.mount("/html", StaticFiles(directory="html"), name="html") -app.mount("/pdf", StaticFiles(directory="pdf"), name="pdf") +@app.route('/pdf/<filename>') +def view_pdf(filename): + """ + Route to view a PDF file from the specified directory. + + Args: + filename (str): Name of the PDF file to display + + Returns: + PDF file or 404 error if file doesn't exist + """ + try: + # Ensure the filename is safe and exists + if not filename.endswith('.pdf'): + abort(400, description="Invalid file type. Only PDF files are allowed.") + + # Check if the file exists in the PDF directory + filepath = os.path.join(PDF_DIRECTORY, filename) + if not os.path.exists(filepath): + abort(404, description="PDF file not found") + + # Send the PDF file + return send_from_directory(PDF_DIRECTORY, filename, as_attachment=False) + + except Exception as e: + abort(500, description=f"Internal server error: {str(e)}") -templates = Jinja2Templates(directory="templates") +@app.route('/pdf') +def list_pdfs(): + """ + Route to list all available PDF files in the directory. + + Returns: + HTML page with list of PDFs or error message + """ + try: + # Get list of PDF files in the directory + pdf_files = [f for f in os.listdir(PDF_DIRECTORY) if f.endswith('.pdf')] + + # Create a simple HTML response with links to PDFs + pdf_links = "\n".join([ + f'<li><a href="/pdf/{file}">{file}</a></li>' + for file in pdf_files + ]) + + return f""" + <html> + <head><title>PDF Viewer</title></head> + <body> + <h1>Available PDFs</h1> + <ul>{pdf_links}</ul> + </body> + </html> + """ + + except Exception as e: + abort(500, description=f"Error listing PDFs: {str(e)}") -@app.get("/", response_class=HTMLResponse) -async def index(request: Request): - return templates.TemplateResponse( - request=request, name="index.html", context={ 'views': db.all() } - ) +if __name__ == '__main__': + # Ensure PDF directory exists + os.makedirs(PDF_DIRECTORY, exist_ok=True) + + # Run the Flask app + app.run(host='0.0.0.0', port=8000, debug=True) diff --git a/scrape_ecourtindia_v6/templates/index.html b/scrape_ecourtindia_v6/templates/index.html deleted file mode 100644 index 0b01b77..0000000 100644 --- a/scrape_ecourtindia_v6/templates/index.html +++ /dev/null @@ -1,40 +1,0 @@ -<html> -<head> - <title>Index</title> - <link - rel="stylesheet" - href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.min.css" - > - <meta charset="utf-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <meta name="color-scheme" content="light dark"> -</head> -<body> - <main class="container"> - <table> - <thead> - <tr> - <th scope="col">Case Info</th> - <th scope="col">Petitioner/Respondent</th> - <th scope="col">HTML File</th> - <th scope="col">Orders</th> - </tr> - </thead> - <tbody> - {% for view in views %} - <tr> - <th scope="row">{{ view.case_info }}</th> - <td>{{ view.petitioner_respondent }}</td> - <td><a href='{{ view.htmlfile }}'>Open</a></td> - <td> - {% for pdf in view.pdfs %} - <a href='{{ pdf }}'>Open</a> - {% endfor %} - </td> - </tr> - {% endfor %} - </tbody> - </table> - </main> -</body> -</html> -- rgit 0.1.5