import os
import tempfile
import uuid
from time import sleep
from urllib import request

import cv2
import pytesseract
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

from .scraper import Scraper


class ScraperOrders(Scraper):
    """Scraper for the eCourts "court order" search page.

    Drives the page through ``self.driver``, solves the captcha with
    Tesseract OCR, parses the result table and downloads each order PDF
    into the ``pdf/`` directory, inserting one record per order into ``db``.
    """

    def __init__(self, db, config):
        """Open the court-order page headlessly and keep ``db`` and ``config``.

        Args:
            db: Object exposing ``insert(record)``; receives one dict per order.
            config: Stored on the instance as ``self.config``; not otherwise
                used by the methods of this class.
        """
        super().__init__(
            'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index',
            headless=True,
        )
        self.db = db
        self.config = config
        # Always defined so handle_orders() is safe to call before (or after
        # a failed) parse_orders_table().
        self.rows = []

    def goto_courtnumber(self):
        """Click the ``courtnumber-tabMenu`` element, then wait one second."""
        self.driver.find_element(By.ID, 'courtnumber-tabMenu').click()
        sleep(1)

    def get_court_numbers(self):
        """Return the text of every option in the ``nnjudgecode1`` select.

        The list is also printed to stdout.
        """
        select = Select(self.driver.find_element(By.ID, 'nnjudgecode1'))
        court_numbers = [option.text for option in select.options]
        print(f'COURT NUMBERS: {court_numbers}')
        return court_numbers

    def submit_search(self):
        """OCR the captcha and submit the search, retrying until accepted.

        Each attempt screenshots the ``captcha_image`` element, reads it with
        Tesseract, types the text into ``order_no_captcha_code`` and runs the
        page's ``submitCourtNumber()`` script. While the ``alert-danger-cust``
        element is displayed afterwards, the modal is closed, the field is
        cleared and the attempt is repeated.
        """
        while True:
            captcha = self.driver.find_element(By.ID, 'captcha_image')
            # The context manager closes (and thereby deletes) the temporary
            # file on every attempt instead of leaking one handle per retry.
            with tempfile.NamedTemporaryFile(suffix='.png') as temp:
                captcha.screenshot(temp.name)
                image = cv2.imread(temp.name)
            text = pytesseract.image_to_string(image).strip()

            field = self.driver.find_element(By.ID, 'order_no_captcha_code')
            field.send_keys(text)
            self.driver.execute_script('submitCourtNumber()')
            sleep(3)

            alert = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust')
            if not alert.is_displayed():
                break
            self.close_modal()
            field.clear()

    def parse_orders_table(self):
        """Parse the ``dispTable`` element into ``self.rows``.

        All ``<td>`` cells of the table are collected in document order.
        Starting at cell index 6 and stepping by 5, each entry of
        ``self.rows`` is ``[cell_i, text(i-1), text(i-2), text(i-3)]`` where
        ``cell_i`` is kept as a BeautifulSoup tag. ``handle_orders`` reads the
        anchor from the tag and stores the three texts as ``date``,
        ``petitioner_respondent`` and ``case_info`` respectively.

        ``self.rows`` is left empty when the table cannot be read.
        """
        # Reset first so a missing table never leaves the previous search's
        # rows behind for handle_orders() to process again.
        self.rows = []
        try:
            table_innerhtml = self.driver.find_element(
                By.ID, 'dispTable'
            ).get_attribute('innerHTML')
        except Exception:
            # Best effort: no readable table means there is nothing to parse.
            return

        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
        self.rows = [
            [cells[i], cells[i - 1].text, cells[i - 2].text, cells[i - 3].text]
            for i in range(6, len(cells), 5)
        ]

    def handle_orders(self, court_name, district):
        """Download the PDF of every parsed order and record it in the db.

        For each entry of ``self.rows`` the anchor's ``onclick`` script is
        executed, the ``data`` URL of the last ``<object>`` element is
        fetched with the browser's cookies into ``pdf/<random hex>.pdf``, a
        record is inserted via ``self.db.insert`` and the ``modalOders``
        dialog is closed.

        Args:
            court_name: Stored in each record under ``court_name``.
            district: Stored in each record under ``district``.
        """
        # Without the target directory every download would fail and be
        # reported as a fetch error.
        os.makedirs('pdf', exist_ok=True)

        for row in self.rows:
            order_cell, date, parties, case_info = row

            link = order_cell.find('a')
            script = link.get('onclick') if link is not None else None
            if not script:
                # No viewer can be opened for this cell, so there is no PDF
                # to download and no modal to close.
                print(f'NO ORDER LINK FOR CASE: {case_info}')
                continue

            self.driver.execute_script(script)
            sleep(0.7)
            viewer = self.driver.find_elements(By.TAG_NAME, 'object')[-1]
            pdf_url = str(viewer.get_attribute('data'))

            # Pick a file name that is not already taken.
            while True:
                filename = f"pdf/{uuid.uuid4().hex}.pdf"
                if not os.path.exists(filename):
                    break

            # Reuse the browser session's cookies for the direct download.
            cookies = "; ".join(
                f"{c['name']}={c['value']}" for c in self.driver.get_cookies()
            )
            try:
                # Request() itself raises ValueError on a malformed URL, so it
                # belongs inside the best-effort block as well.
                req = request.Request(pdf_url)
                req.add_header("Cookie", cookies)
                with request.urlopen(req) as response, open(filename, "wb") as file:
                    file.write(response.read())
            except Exception:
                print(f'UNABLE TO FETCH PDF: {pdf_url}')

            # NOTE(review): the record is inserted even when the download
            # failed, so 'filename' may point at a missing file - confirm
            # this is intended.
            self.db.insert({
                'district': district,
                'court_name': court_name,
                'case_info': case_info,
                'petitioner_respondent': parties,
                'date': date,
                'filename': filename,
            })

            sleep(0.7)
            modal = self.driver.find_element(By.ID, 'modalOders')
            modal.find_element(By.CLASS_NAME, 'btn-close').click()