🏡 index : github.com/compromyse/enfold.git

author Raghuram Subramani <raghus2247@gmail.com> 2025-03-25 20:33:06.0 +05:30:00
committer Raghuram Subramani <raghus2247@gmail.com> 2025-03-25 20:33:06.0 +05:30:00
commit
f362fbdbcf4da26ba7834c398398abbc1c7019df [patch]
tree
9d96ee78bffd984891ba5645691268e1793de6d9
parent
434252fa1831465b36e32206684e78cd698e8462
download
f362fbdbcf4da26ba7834c398398abbc1c7019df.tar.gz

retry captcha until it works :)



Diff

 scrape_ecourtindia_v6/main.py    |  1 -
 scrape_ecourtindia_v6/scraper.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
 2 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py
index 4794f8f..4a5c272 100644
--- a/scrape_ecourtindia_v6/main.py
+++ b/scrape_ecourtindia_v6/main.py
@@ -10,5 +10,4 @@
 if __name__ == '__main__':
     m = Scraper(db)
     m.run()
-    m.handle_views()
     m.driver.close()
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
index ebe559c..06c2cad 100644
--- a/scrape_ecourtindia_v6/scraper.py
+++ b/scrape_ecourtindia_v6/scraper.py
@@ -34,14 +34,15 @@
        self.close_modal()
        self.goto_acts()
        self.select_act()
        self.parse_table()
        self.handle_table()

    def close_modal(self):
        sleep(2)
        sleep(3)
        self.driver.execute_script('closeModel({modal_id:"validateError"})')
        sleep(1)

    def select(self, i_d, value):
        sleep(1)
        element = self.driver.find_element(By.ID, i_d)
        select = Select(element)
        select.select_by_value(value)
@@ -56,52 +57,63 @@
        self.submit_search()

    def goto_acts(self):
        self.select('sess_state_code', Karnataka)
        self.select('sess_dist_code', Bengaluru)
        self.select('court_complex_code', CMM_Court_Complex)
        while True:
            self.select('sess_state_code', Karnataka)
            self.select('sess_dist_code', Bengaluru)
            self.select('court_complex_code', CMM_Court_Complex)

        sleep(1)
            sleep(2)
            if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed():
                self.driver.execute_script('closeModel({modal_id:"validateError"})')
                continue

            break

        self.select('court_est_code', Chief_Metropolitan )

        sleep(1)
        element = self.driver.find_element(By.ID, 'act-tabMenu')
        element.click()
        sleep(1)

    def submit_search(self):
        sleep(2)
        img = self.driver.find_element(By.ID, 'captcha_image')
        temp = tempfile.NamedTemporaryFile(suffix='.png')
        img.screenshot(temp.name)
        captcha_incomplete = True
        while captcha_incomplete:
            sleep(2)
            img = self.driver.find_element(By.ID, 'captcha_image')
            temp = tempfile.NamedTemporaryFile(suffix='.png')
            img.screenshot(temp.name)

        img = cv2.imread(temp.name)
        text = pytesseract.image_to_string(img).strip()
            img = cv2.imread(temp.name)
            text = pytesseract.image_to_string(img).strip()

        element = self.driver.find_element(By.ID, 'act_captcha_code')
        element.send_keys(text)
            element = self.driver.find_element(By.ID, 'act_captcha_code')
            element.send_keys(text)

        self.driver.execute_script('submitAct()')
        sleep(3)
            self.driver.execute_script('submitAct()')
            sleep(3)

            if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed():
                self.driver.execute_script('closeModel({modal_id:"validateError"})')
                element.clear()
            else:
                captcha_incomplete = False

    def parse_table(self):
    def handle_table(self):
        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
        rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
        self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
        self.views = []
        i = 5
        while i < len(rows):
            self.views.append(rows[i])
        while i < len(self.rows):
            view = self.rows[i]

            self.current_view = {
                'case_info': rows[i-2].get_text(strip=True),
                'petitioner_respondent': ' Vs '.join(rows[i-1].get_text(strip=True).split('Vs')),
                'case_info': self.rows[i-2].get_text(strip=True),
                'petitioner_respondent': ' Vs '.join(self.rows[i-1].get_text(strip=True).split('Vs')),
                'htmlfile': '',
                'pdfs': []
            }

            i += 4

    def handle_views(self):
        i = 0
        for view in self.views:
            script = view.find_all('a')[0].get_attribute_list('onclick')[0]
            self.driver.execute_script(script)
            sleep(1)
@@ -120,12 +132,9 @@
            self.parse_orders_table()

            self.db.insert(self.current_view)
            print(f'INSERTED: {self.current_view}')
            self.driver.find_element(By.ID, 'main_back_act').click()

            i += 1
            if i == 10:
                break

            i += 4

    def parse_orders_table(self):
        try:
@@ -160,7 +169,10 @@
             r = request.Request(pdf_url)
             r.add_header("Cookie", cookies)

-            with request.urlopen(r) as response, open(filename, "wb") as file:
-                file.write(response.read())
+            try:
+                with request.urlopen(r) as response, open(filename, "wb") as file:
+                    file.write(response.read())
+            except:
+                print(f'UNABLE TO FETCH PDF: {pdf_url}')

             self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()