From f362fbdbcf4da26ba7834c398398abbc1c7019df Mon Sep 17 00:00:00 2001 From: Raghuram Subramani <raghus2247@gmail.com> Date: Tue, 25 Mar 2025 20:33:06 +0530 Subject: [PATCH] retry captcha until it works :) --- scrape_ecourtindia_v6/main.py | 1 - scrape_ecourtindia_v6/scraper.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------- 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py index 4794f8f..4a5c272 100644 --- a/scrape_ecourtindia_v6/main.py +++ a/scrape_ecourtindia_v6/main.py @@ -10,5 +10,4 @@ if __name__ == '__main__': m = Scraper(db) m.run() - m.handle_views() m.driver.close() diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py index ebe559c..06c2cad 100644 --- a/scrape_ecourtindia_v6/scraper.py +++ a/scrape_ecourtindia_v6/scraper.py @@ -34,14 +34,15 @@ self.close_modal() self.goto_acts() self.select_act() - self.parse_table() + self.handle_table() def close_modal(self): - sleep(2) + sleep(3) self.driver.execute_script('closeModel({modal_id:"validateError"})') sleep(1) def select(self, i_d, value): + sleep(1) element = self.driver.find_element(By.ID, i_d) select = Select(element) select.select_by_value(value) @@ -56,52 +57,63 @@ self.submit_search() def goto_acts(self): - self.select('sess_state_code', Karnataka) - self.select('sess_dist_code', Bengaluru) - self.select('court_complex_code', CMM_Court_Complex) + while True: + self.select('sess_state_code', Karnataka) + self.select('sess_dist_code', Bengaluru) + self.select('court_complex_code', CMM_Court_Complex) - sleep(1) + sleep(2) + if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed(): + self.driver.execute_script('closeModel({modal_id:"validateError"})') + continue + + break + self.select('court_est_code', Chief_Metropolitan ) + sleep(1) element = self.driver.find_element(By.ID, 'act-tabMenu') element.click() sleep(1) def submit_search(self): - sleep(2) - img = self.driver.find_element(By.ID, 'captcha_image') - temp = tempfile.NamedTemporaryFile(suffix='.png') - img.screenshot(temp.name) + captcha_incomplete = True + while captcha_incomplete: + sleep(2) + img = self.driver.find_element(By.ID, 'captcha_image') + temp = tempfile.NamedTemporaryFile(suffix='.png') + img.screenshot(temp.name) - img = cv2.imread(temp.name) - text = pytesseract.image_to_string(img).strip() + img = cv2.imread(temp.name) + text = pytesseract.image_to_string(img).strip() - element = self.driver.find_element(By.ID, 'act_captcha_code') - element.send_keys(text) + element = self.driver.find_element(By.ID, 'act_captcha_code') + element.send_keys(text) - self.driver.execute_script('submitAct()') - sleep(3) + self.driver.execute_script('submitAct()') + sleep(3) + if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed(): + self.driver.execute_script('closeModel({modal_id:"validateError"})') + element.clear() + else: + captcha_incomplete = False - def parse_table(self): + def handle_table(self): table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML') - rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td') + self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td') self.views = [] i = 5 - while i < len(rows): - self.views.append(rows[i]) + while i < len(self.rows): + view = self.rows[i] + self.current_view = { - 'case_info': rows[i-2].get_text(strip=True), - 'petitioner_respondent': ' Vs '.join(rows[i-1].get_text(strip=True).split('Vs')), + 'case_info': self.rows[i-2].get_text(strip=True), + 'petitioner_respondent': ' Vs '.join(self.rows[i-1].get_text(strip=True).split('Vs')), 'htmlfile': '', 'pdfs': [] } - i += 4 - - def handle_views(self): - i = 0 - for view in self.views: script = view.find_all('a')[0].get_attribute_list('onclick')[0] self.driver.execute_script(script) sleep(1) @@ -120,12 +132,9 @@ self.parse_orders_table() self.db.insert(self.current_view) + print(f'INSERTED: {self.current_view}') self.driver.find_element(By.ID, 'main_back_act').click() - - i += 1 - if i == 10: - break - + i += 4 def parse_orders_table(self): try: @@ -160,7 +169,10 @@ r = request.Request(pdf_url) r.add_header("Cookie", cookies) - with request.urlopen(r) as response, open(filename, "wb") as file: - file.write(response.read()) + try: + with request.urlopen(r) as response, open(filename, "wb") as file: + file.write(response.read()) + except: + print(f'UNABLE TO FETCH PDF: {pdf_url}') self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click() -- rgit 0.1.5