# Build/refresh a mapping of Microsoft KB article ids to their support.microsoft.com metadata.
|
|
|
import yaml
|
|
|
|
import json
|
|
|
|
import re
|
|
|
|
import datetime
|
|
|
|
from bs4 import BeautifulSoup
|
2023-01-12 10:12:13 +00:00
|
|
|
from urllib import request
|
|
|
|
import urllib.error
|
2023-01-12 09:08:22 +00:00
|
|
|
|
2023-01-12 10:12:13 +00:00
|
|
|
class NoRedirect(request.HTTPRedirectHandler):
    """Opener handler that refuses to follow HTTP redirects.

    Returning ``None`` from ``redirect_request`` makes urllib surface the
    3xx response as an ``HTTPError`` instead of fetching the new location,
    so callers can read the ``Location`` header themselves.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Decline every redirect unconditionally.
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Publication date embedded in a KB slug.  Three shapes occur:
#   "<year>-<month>-<day>", "<month>-<day>-<year>", or "<month>-<number>"
# where in the last case the number may itself be the year.
DATE_REGEX = r'(?:(?P<year>\d{4})-)?(?P<month>january|february|march|april|may|june|july|august|september|october|november|december)-(?P<date>\d+)(?:-(?P<year2>\d{4})-)?'


def parse_redirect(kb_id, slug):
    """Extract article metadata from a KB redirect slug.

    The slug ends with a 36-character UUID and embeds a publication date
    (see DATE_REGEX).  Returns a dict with the ISO date, uuid, slug and the
    canonical support URL, or None when no usable date can be recovered.
    """
    m = re.search(DATE_REGEX, slug)
    if m is None:
        return None

    year = m.group('year') or m.group('year2')
    day = int(m.group('date'))
    if year is None:
        # Slug had only "<month>-<number>"; if the number looks like a
        # year, treat it as one and default the day to the 1st.
        if 1900 < day < 2030:
            year = day
            day = 1
        else:
            return None
    # BUGFIX: was `day > 30`, which silently dropped every article
    # published on the 31st of a month.
    if day > 31:
        return None

    date_s = f"{day} {m.group('month').title()} {year}"
    try:
        date = datetime.datetime.strptime(date_s, "%d %B %Y").strftime("%Y-%m-%d")
    except ValueError:
        # Impossible combination such as "31 February" — no usable date.
        return None

    return {
        "date": date,
        "uuid": slug[-36:],
        "slug": slug,
        "url": f"https://support.microsoft.com/help/{kb_id}"
    }
|
|
|
|
|
|
|
|
def get_url_slug(kb_id):
    """Resolve the redirect slug for a KB article.

    Issues a HEAD request against the generic support URL for *kb_id* and,
    instead of following the redirect, reads the target out of the response's
    Location header.  Returns the last path component of that target, or
    None when the server offers no redirect.
    """
    # Suppress redirect-following globally so the 3xx response surfaces
    # as an HTTPError whose headers we can inspect.
    request.install_opener(request.build_opener(NoRedirect))

    url = f"https://support.microsoft.com/help/{kb_id}"
    head = urllib.request.Request(url, method="HEAD")
    try:
        urllib.request.urlopen(head, data=None, timeout=5)
    except urllib.error.HTTPError as err:
        location = err.headers.get('location')
        if location is not None:
            return location.split('/')[-1]
    return None
|
2023-01-12 09:08:22 +00:00
|
|
|
|
|
|
|
def update_mapping(kb_ids):
    """Fetch metadata for every KB id not yet in data.json and persist it.

    Loads the existing id->metadata mapping from data.json, resolves the
    redirect slug for each unseen id, and rewrites data.json only when at
    least one new entry was added.  Progress is printed per id.
    """
    print(f"Total Count: {len(kb_ids)}")

    with open('data.json', 'r') as f:
        kb = json.load(f)

    updated = False
    # enumerate replaces the original hand-rolled `i = i + 1` counter.
    for i, kb_id in enumerate(kb_ids, start=1):
        if kb_id not in kb:
            slug = get_url_slug(kb_id)
            if slug:
                new_data = parse_redirect(kb_id, slug)
                if new_data:
                    updated = True
                    kb[kb_id] = new_data
        print(f"Status: {i}/{len(kb_ids)}")

    # Only rewrite the file when something actually changed.
    if updated:
        with open('data.json', 'w') as f:
            json.dump(kb, f, indent=2)
|
|
|
|
|
|
|
|
def fetch_kb_mentions(url):
    """Yield every KB article id linked from the page at *url*.

    Scans the page's main content div for anchors whose href points at a
    support.microsoft.com /kb/ or /help/ URL and yields the id path segment
    (the 5th component of the split URL in both layouts).
    """
    prefixes = (
        'https://support.microsoft.com/kb/',
        'https://support.microsoft.com/help/',
    )
    with urllib.request.urlopen(url, data=None, timeout=5) as response:
        soup = BeautifulSoup(response, features="html5lib")
        content = soup.find('div', class_='content')
        for anchor in content.find_all('a', href=True):
            href = anchor['href']
            # str.startswith accepts a tuple of alternatives.
            if href.startswith(prefixes):
                yield href.split('/')[4]
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Gather every KB id mentioned on each discovery page, then refresh the
    # local id -> metadata mapping in data.json.
    kbs = []
    with open('discovery.txt', 'r') as f:
        for url in f:
            # BUGFIX: iterating a file keeps the trailing newline; a URL
            # containing "\n" makes urlopen raise (control characters are
            # rejected by http.client), so strip and skip blank lines.
            url = url.strip()
            if not url:
                continue
            for kb_id in fetch_kb_mentions(url):
                kbs.append(kb_id)
    update_mapping(kbs)
|