"""Build a JSON mapping from KB ids to the date encoded in their redirect slug.

The script reads discovery page URLs from ``discovery.txt``, collects the KB
ids linked from each page, resolves every unknown id to its redirect slug and
stores the parsed result in ``data.json``.
"""

import datetime
import json
import re
import urllib.error
from urllib.request import HTTPRedirectHandler

import yaml
from bs4 import BeautifulSoup
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util import Retry


class NoRedirect(HTTPRedirectHandler):
    """Redirect handler whose ``redirect_request`` never builds a new request."""

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Returning None means this handler issues no follow-up request.
        return None


# Matches "<year>-<month>-<day>" or "<month>-<day>-<year>-" inside a slug.
# Both year groups are optional; month names are lowercase English.
DATE_REGEX = (
    r'(?:(?P<year>\d{4})-)?'
    r'(?P<month>january|february|march|april|may|june|july|august|september'
    r'|october|november|december)'
    r'-(?P<date>\d+)(?:-(?P<year2>\d{4})-)?'
)


def parse_redirect(kb_id, slug):
    """Extract the date and identifiers encoded in a redirect slug.

    Args:
        kb_id: Identifier of the KB article the slug belongs to.
        slug: Last path segment of the redirect target.

    Returns:
        A dict with the keys ``date`` (``YYYY-MM-DD``), ``uuid`` (the last 36
        characters of ``slug``), ``slug`` and ``url``, or ``None`` when no
        valid date can be derived from the slug.
    """
    m = re.search(DATE_REGEX, slug)
    if m is None:
        return None

    y = m.group('year') or m.group('year2')
    d = int(m.group('date'))
    if y is None:
        # No explicit year: the number after the month may itself be the
        # year ("<month>-<year>"). In that case use the 1st of the month.
        if 1900 < d < 2030:
            y = d
            d = 1
        else:
            return None

    date_s = f"{d} {m.group('month').title()} {y}"
    try:
        # strptime validates the day against the month, so impossible dates
        # (day 0, 30 February, three-digit days, ...) are rejected here
        # instead of crashing, while a genuine 31st is accepted.
        # %B is locale-dependent; this relies on English month names.
        parsed = datetime.datetime.strptime(date_s, "%d %B %Y")
    except ValueError:
        return None

    return {
        "date": parsed.strftime("%Y-%m-%d"),
        "uuid": slug[-36:],
        "slug": slug,
        # NOTE(review): this literal contains only the KB id; a URL prefix
        # looks to be missing here - confirm the intended base URL.
        "url": f"{kb_id}",
    }


def get_url_slug(session, kb_id):
    """Return the last path segment of the redirect target for ``kb_id``.

    Sends a HEAD request without following redirects and inspects the
    ``location`` response header.

    Args:
        session: ``requests`` session used to send the request.
        kb_id: Identifier of the KB article to look up.

    Returns:
        The text after the final ``/`` of the ``location`` header, or
        ``None`` when the response carries no such header.
    """
    # NOTE(review): this literal contains only the KB id; a URL prefix looks
    # to be missing here - confirm the intended base URL.
    url = f"{kb_id}"
    response = session.head(url, allow_redirects=False, timeout=5)
    if 'location' in response.headers:
        return response.headers['location'].split('/')[-1]
    return None


def update_mapping(session, kb_ids):
    """Add entries for previously unseen KB ids to ``data.json``.

    Ids already present in the file are skipped. The file is rewritten only
    when at least one new entry was added.

    Args:
        session: ``requests`` session used for the slug lookups.
        kb_ids: Sized iterable of KB ids to process.
    """
    total = len(kb_ids)
    print(f"Total Count: {total}")
    updated = False
    with open('data.json', 'r', encoding='utf-8') as f:
        kb = json.load(f)

    for i, kb_id in enumerate(kb_ids, start=1):
        if kb_id not in kb:
            slug = get_url_slug(session, kb_id)
            if slug:
                new_data = parse_redirect(kb_id, slug)
                if new_data:
                    updated = True
                    kb[kb_id] = new_data
        # NOTE(review): the original nesting of this progress line could not
        # be recovered; it is printed once per processed id.
        print(f"Status: {i}/{total}")

    if updated:
        with open('data.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(kb, indent=2))


def fetch_kb_mentions(session, url):
    """Yield the KB ids linked from the ``div.content`` element of a page.

    Args:
        session: ``requests`` session used to download the page.
        url: Address of the page to scan.

    Yields:
        The fifth ``/``-separated component of every matching link target.
        Pages without a ``div.content`` element yield nothing.
    """
    with session.get(url, timeout=10) as response:
        print(url)
        soup = BeautifulSoup(response.text, features="html5lib")

    content = soup.find('div', class_='content')
    if content is None:
        return

    for a in content.find_all('a', href=True):
        l = a['href']
        # NOTE(review): both prefixes are empty strings, so every link
        # matches; the intended URL prefixes look to be missing - confirm.
        if l.startswith('') or l.startswith(''):
            parts = l.split('/')
            # Skip links too short to carry an id at index 4.
            if len(parts) > 4:
                yield parts[4]


def main():
    """Collect KB ids from the pages in ``discovery.txt`` and update the map."""
    kbs = []
    s = Session()
    retries = Retry(
        total=3,
        backoff_factor=0.1,
        status_forcelist=[502, 503, 504],
        # HEAD is included because the slug lookup uses session.head().
        allowed_methods={'GET', 'HEAD'},
    )
    s.mount('https://', HTTPAdapter(max_retries=retries))

    with open('discovery.txt', 'r') as f:
        for url in f:
            url = url.strip()
            if not url:
                # Blank lines are not valid request targets.
                continue
            for kb_id in fetch_kb_mentions(s, url):
                kbs.append(kb_id)

    update_mapping(s, kbs)


if __name__ == "__main__":
    main()