microsoft-kb-metadata/update.py
2024-07-04 10:44:29 +05:30

101 lines
3.1 KiB
Python

import yaml
import json
import re
import datetime
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from urllib.request import HTTPRedirectHandler
from requests import Session
import urllib.error
class NoRedirect(HTTPRedirectHandler):
def redirect_request(self, req, fp, code, msg, headers, newurl):
return None
DATE_REGEX = r'(?:(?P<year>\d{4})-)?(?P<month>january|february|march|april|may|june|july|august|september|october|november|december)-(?P<date>\d+)(?:-(?P<year2>\d{4})-)?'
def parse_redirect(kb_id, slug):
m = re.search(DATE_REGEX, slug)
if m == None:
return None
else:
y = m.group('year') or m.group('year2')
d = int(m.group('date'))
if y == None:
# if date actually contains the year
# set date to 1, and set year to date
if d>1900 and d<2030:
y = d
d = 1
else:
return None
if d>30:
return None
date_s = f"{d} {m.group('month').title()} {y}"
date = datetime.datetime.strptime(date_s, "%d %B %Y").strftime("%Y-%m-%d")
return {
"date": date,
"uuid": slug[-36:],
"slug": slug,
"url": f"https://support.microsoft.com/help/{kb_id}"
}
def get_url_slug(session, kb_id):
url = f"https://support.microsoft.com/help/{kb_id}"
response = session.head(url, allow_redirects=False, timeout=5)
if 'location' in response.headers:
l = response.headers['location']
return l.split('/')[-1]
def update_mapping(session, kb_ids):
print(f"Total Count: {len(kb_ids)}")
kb = None
updated = False
with open('data.json', 'r') as f:
kb = json.load(f)
i = 0
for kb_id in kb_ids:
i=i+1
if kb_id not in kb:
slug = get_url_slug(session, kb_id)
if slug:
new_data = parse_redirect(kb_id, slug)
if new_data:
updated = True
kb[kb_id] = new_data
print(f"Status: {i}/{len(kb_ids)}")
if updated:
with open('data.json', 'w') as f:
f.write(json.dumps(kb, indent=2))
def fetch_kb_mentions(session, url):
with session.get(url, timeout=10) as response:
print(url)
soup = BeautifulSoup(response.text, features="html5lib")
for a in soup.find('div', class_='content').find_all('a', href=True):
l = a['href']
if l.startswith('https://support.microsoft.com/kb/') or l.startswith('https://support.microsoft.com/help/'):
yield l.split('/')[4]
if __name__ == "__main__":
kbs = []
s = Session()
retries = Retry(
total=3,
backoff_factor=0.1,
status_forcelist=[502, 503, 504],
allowed_methods={'GET'},
)
s.mount('https://', HTTPAdapter(max_retries=retries))
with open('discovery.txt', 'r') as f:
for url in f.readlines():
url = url.strip()
for kb_id in fetch_kb_mentions(s, url):
kbs.append(kb_id)
update_mapping(s, kbs)