update
Diff
 scrape_ecourtindia_v6/orders_scrape_courts.py        | 130 ++++++++++++++++++++
 scrape_ecourtindia_v6/scrape_orders.py               | 115 ++++++++------------
 scrape_ecourtindia_v6/modules/scraper_case_status.py |   2 +-
 scrape_ecourtindia_v6/modules/scraper_orders.py      |  84 ++++++++++++++++++-
4 files changed, 257 insertions(+), 74 deletions(-)
--- /dev/null
+++ b/scrape_ecourtindia_v6/orders_scrape_courts.py
@@ -0,0 +1,130 @@
+import csv
+from time import sleep
+from modules.scraper_orders import ScraperOrders
+from selenium.webdriver.common.by import By
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+
+class ThreadSafeCSVWriter:
+    # csv.writer guarded by a lock so worker threads can share one output file.
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+            print(f'Wrote: {row}')
+
+    def close(self):
+        self.file.close()
+
+
+def scrape_district(state, district, csv_writer):
+    try:
+        config = {}
+        scraper = ScraperOrders(config)
+        scraper.close_modal()
+        scraper.select('sess_state_code', state)
+        scraper.select('sess_dist_code', district)
+        complexes = scraper.scrape_complexes()
+        scraper.select('court_complex_code', complexes[0])
+        sleep(2)
+        scraper.goto_courtnumber()
+        for cmplx in complexes:
+            # Keep dismissing any open modal before touching the form.
+            while True:
+                sleep(0.5)
+                try:
+                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                    if modal_is_open:
+                        scraper.close_modal()
+                        continue
+                    break
+                except:
+                    break
+            scraper.select('court_complex_code', cmplx)
+            sleep(0.5)
+            court_numbers = scraper.get_court_numbers()
+            for court_number in court_numbers:
+                row = [state, district, cmplx, court_number]
+                csv_writer.writerow(row)
+        scraper.driver.quit()
+    except Exception as e:
+        print(f"Error scraping district {district}: {e}")
+
+
+def scrape_courts():
+    state = 'Uttar Pradesh'
+    config = {}
+    scraper = ScraperOrders(config)
+    scraper.close_modal()
+    scraper.select('sess_state_code', state)
+    districts = scraper.scrape_districts()
+    scraper.driver.quit()
+
+    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
+    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_district, state, district, csv_writer)
+            for district in districts
+        ]
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+    csv_writer.close()
+
+
+def scrape_orders(courts):
+    csvfile = open(courts, newline='')
+    reader = csv.reader(csvfile)
+    for row in reader:
+        print(row)
+        config = {}
+        scraper = ScraperOrders(config)
+        scraper.close_modal()
+        scraper.select('sess_state_code', row[0])
+        scraper.select('sess_dist_code', row[1])
+        while True:
+            sleep(0.5)
+            try:
+                modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                if modal_is_open:
+                    scraper.close_modal()
+                    continue
+                break
+            except:
+                break
+        scraper.select('court_complex_code', row[2])
+        sleep(1)
+        scraper.goto_courtnumber()
+        scraper.select('nnjudgecode1', row[3])
+        scraper.driver.find_element(By.ID, 'radBoth2').click()
+        scraper.submit_search()
+        scraper.parse_orders_table()
+        scraper.handle_orders()
+        break  # only the first CSV row is processed
+    csvfile.close()
+
+
+if __name__ == '__main__':
+    scrape_orders('csv/2023-24_pocso.csv')
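
The modal-dismissal polling loop above is repeated verbatim in several places across this diff. A minimal sketch of a shared helper that could replace it (hypothetical name wait_until_modal_dismissed; assumes the same scraper.close_modal() / scraper.driver API used above):

from time import sleep
from selenium.webdriver.common.by import By

def wait_until_modal_dismissed(scraper, poll_interval=0.5):
    # Keep closing any visible modal until none is displayed.
    while True:
        sleep(poll_interval)
        try:
            if scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed():
                scraper.close_modal()
                continue
            break
        except Exception:
            break  # no modal element on the page: nothing to dismiss
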
--- a/scrape_ecourtindia_v6/scrape_orders.py
+++ b/scrape_ecourtindia_v6/scrape_orders.py
@@ -1,82 +1,71 @@
 import csv
 from time import sleep
+from tinydb import TinyDB
 from modules.scraper_orders import ScraperOrders
 from selenium.webdriver.common.by import By
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import threading
 
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
+class ThreadSafeDB:
+    # One TinyDB handle guarded by a lock so worker threads can insert safely.
+    def __init__(self):
+        self.db = TinyDB('orders.json')
         self.lock = threading.Lock()
 
-    def writerow(self, row):
+    def insert(self, record):
         with self.lock:
-            self.writer.writerow(row)
-            print(f'Wrote: {row}')
+            self.db.insert(record)
+            print(f'INSERTED: {record}')
 
-    def close(self):
-        self.file.close()
+
+db = ThreadSafeDB()
 
-def scrape_district(state, district, csv_writer):
+def scrape_single_court(row):
     try:
         config = {}
-        scraper = ScraperOrders(config)
+        scraper = ScraperOrders(db, config)
         scraper.close_modal()
-        scraper.select('sess_state_code', state)
-        scraper.select('sess_dist_code', district)
-        complexes = scraper.scrape_complexes()
-        scraper.select('court_complex_code', complexes[0])
-        sleep(2)
-        scraper.goto_courtnumber()
-        for cmplx in complexes:
-            while True:
-                sleep(0.5)
-                try:
-                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
-                    if modal_is_open:
-                        scraper.close_modal()
-                        continue
-                    break
-                except:
-                    break
-            scraper.select('court_complex_code', cmplx)
-            sleep(0.5)
-            court_numbers = scraper.get_court_numbers()
-            for court_number in court_numbers:
-                row = [state, district, cmplx, court_number]
-                csv_writer.writerow(row)
+        scraper.select('sess_state_code', row[0])
+        scraper.select('sess_dist_code', row[1])
+        # Keep dismissing any open modal before touching the complex selector.
+        while True:
+            sleep(0.5)
+            try:
+                modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                if modal_is_open:
+                    scraper.close_modal()
+                    continue
+                break
+            except:
+                break
+        scraper.select('court_complex_code', row[2])
+        sleep(1)
+        scraper.goto_courtnumber()
+        scraper.select('nnjudgecode1', row[3])
+        scraper.driver.find_element(By.ID, 'radBoth2').click()
+        scraper.submit_search()
+        scraper.parse_orders_table()
+        scraper.handle_orders()
         scraper.driver.quit()
     except Exception as e:
-        print(f"Error scraping district {district}: {e}")
+        print(f"Error processing court {row}: {e}")
 
-def scrape_courts():
-    state = 'Uttar Pradesh'
-    config = {}
-    scraper = ScraperOrders(config)
-    scraper.close_modal()
-    scraper.select('sess_state_code', state)
-    districts = scraper.scrape_districts()
-    scraper.driver.quit()
-    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
-    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
+def scrape_orders(courts_csv):
+    with open(courts_csv, newline='') as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader, None)  # skip the header row
+        courts = list(reader)
+
     with ThreadPoolExecutor(max_workers=5) as executor:
         futures = [
-            executor.submit(scrape_district, state, district, csv_writer)
-            for district in districts
+            executor.submit(scrape_single_court, court)
+            for court in courts
         ]
         for future in as_completed(futures):
@@ -85,16 +74,6 @@
             except Exception as e:
                 print(f"A thread encountered an error: {e}")
-    csv_writer.close()
-
-def scrape_orders(courts):
-    csvfile = open(courts, newline='')
-    reader = csv.reader(csvfile)
-    for row in reader:
-        print(row)
-    csvfile.close()
 
 if __name__ == '__main__':
-    scrape_orders('csv/2023-24_pocso.csv')
+    input_file = 'csv/2023-24_pocso.csv'
+    scrape_orders(input_file)
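
scrape_single_court indexes its CSV row positionally. A sketch of the expected layout, matching the header that scrape_courts() writes in orders_scrape_courts.py (the district and complex values here are purely illustrative):

# Illustrative row consumed by scrape_single_court:
# State,District,Cmplx,Court number   <- header written by scrape_courts()
row = ['Uttar Pradesh', 'Agra', 'Civil Court, Agra', '1']
# row[0] -> sess_state_code     row[1] -> sess_dist_code
# row[2] -> court_complex_code  row[3] -> nnjudgecode1
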
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -134,7 +134,7 @@
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)
-            sleep(2)
+            sleep(0.7)
             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
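
The change above shortens a fixed sleep from 2 s to 0.7 s, trading robustness for speed. A sketch of an explicit wait using Selenium's WebDriverWait (not what the code does today; drop-in for the two lines after the sleep, assuming the same self.driver):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block until the <object> embedding the order PDF appears, up to 10 s.
obj = WebDriverWait(self.driver, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, 'object'))
)
pdf_url = str(obj.get_attribute('data'))
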
--- a/scrape_ecourtindia_v6/modules/scraper_orders.py
+++ b/scrape_ecourtindia_v6/modules/scraper_orders.py
@@ -1,17 +1,25 @@
 from time import sleep
 import tempfile
+import uuid
+import os
 from urllib import request
 from bs4 import BeautifulSoup
 import cv2
 import pytesseract
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.select import Select
-from tinydb import TinyDB
 from .scraper import Scraper
 
 
 class ScraperOrders(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index')
-        self.db = TinyDB('db.json')
+    def __init__(self, db, config):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index', headless=True)
+        self.db = db
         self.config = config
 
     def goto_courtnumber(self):
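
The constructor now receives the database handle instead of opening its own TinyDB, which is what makes the thread pool in scrape_orders.py safe. A minimal sketch of the intended wiring, using names from this diff:

# One lock-guarded TinyDB wrapper shared by every ScraperOrders instance.
db = ThreadSafeDB()                     # defined in scrape_orders.py
scraper = ScraperOrders(db, config={})  # new (db, config) signature
# Every handle_orders() insert now goes through db.lock.
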
@@ -27,3 +35,69 @@
         print(f'COURT NUMBERS: {court_numbers}')
         return court_numbers
+
+    def submit_search(self):
+        # Solve the captcha with OCR and retry until the search goes through.
+        captcha_incomplete = True
+        while captcha_incomplete:
+            img = self.driver.find_element(By.ID, 'captcha_image')
+            temp = tempfile.NamedTemporaryFile(suffix='.png')
+            img.screenshot(temp.name)
+            img = cv2.imread(temp.name)
+            text = pytesseract.image_to_string(img).strip()
+            element = self.driver.find_element(By.ID, 'order_no_captcha_code')
+            element.send_keys(text)
+            self.driver.execute_script('submitCourtNumber()')
+            sleep(3)
+            if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed():
+                # OCR missed: dismiss the error and try a fresh captcha.
+                self.close_modal()
+                element.clear()
+            else:
+                captcha_incomplete = False
+
+    def parse_orders_table(self):
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except:
+            return
+        rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
+        self.rows = []
+        # Each result row spans five <td> cells: cell i holds the order link,
+        # i-1 the date, i-2 the parties, and i-3 the case info.
+        i = 6
+        while i < len(rows):
+            self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ])
+            i += 5
+
+    def handle_orders(self):
+        for row in self.rows:
+            order = row[0]
+            script = order.find_all('a')[0].get_attribute_list('onclick')[0]
+            self.driver.execute_script(script)
+            sleep(0.7)
+            obj = self.driver.find_elements(By.TAG_NAME, 'object')[-1]
+            pdf_url = str(obj.get_attribute('data'))
+
+            # Pick an unused random filename for the PDF.
+            while True:
+                filename = f"pdf/{uuid.uuid4().hex}.pdf"
+                if not os.path.exists(filename):
+                    break
+
+            # Reuse the Selenium session cookies so the download is authenticated.
+            cookies = "; ".join([f"{c['name']}={c['value']}" for c in self.driver.get_cookies()])
+            r = request.Request(pdf_url)
+            r.add_header("Cookie", cookies)
+            try:
+                with request.urlopen(r) as response, open(filename, "wb") as file:
+                    file.write(response.read())
+            except:
+                print(f'UNABLE TO FETCH PDF: {pdf_url}')
+
+            record = { 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
+            self.db.insert(record)
+            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
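
handle_orders writes out whatever bytes the server returns, so an expired session can silently produce non-PDF files on disk. A sketch of a cheap validity check (hypothetical looks_like_pdf helper; assumes the orders are served as plain PDF files):

def looks_like_pdf(path):
    # Cheap sanity check: a real PDF starts with the %PDF- magic bytes.
    with open(path, 'rb') as f:
        return f.read(5) == b'%PDF-'

# e.g. after writing `filename` in handle_orders:
# if not looks_like_pdf(filename):
#     print(f'BAD DOWNLOAD: {pdf_url}')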