Data Update, etc
This commit is contained in:
parent
a974526e61
commit
cebf6483c9
11
README.md
11
README.md
|
@ -1,10 +1,15 @@
|
|||
# Microsoft KB metadata
|
||||
# Microsoft Knowledge Base metadata
|
||||
|
||||
This repository hosts a small subset of the Microsoft Knowledge Base metadata. The data in `data.json` contains the following:
|
||||
|
||||
1. Date of the KB publication
|
||||
2. KB UUID
|
||||
3. KB Slug
|
||||
4. KB URL (en-us)
|
||||
4. KB URL
|
||||
|
||||
The KBs are hand-picked for now.
|
||||
The list of KB IDs in the database is scraped from the URLs in `discovery.txt`. The primary use case of the dataset is to provide a `KB:DATE`
|
||||
mapping to other projects.
|
||||
|
||||
## license
|
||||
|
||||
Data and code in this repository are licensed under [Creative Commons Zero v1.0 Universal](https://choosealicense.com/licenses/cc0-1.0/#).
|
64
update.py
64
update.py
|
@ -3,45 +3,81 @@ import json
|
|||
import re
|
||||
import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib.request
|
||||
DATE_REGEX = r'(?:(?P<year>\d{4})-)?(?P<month>january|february|march|april|may|june|july|august|september|october|november|december)-(?P<date>\d+)(?:-(?P<year2>\d{4}))?'
|
||||
from urllib import request
|
||||
import urllib.error
|
||||
|
||||
# load data from data.yml
|
||||
redirect_data = yaml.safe_load(open('data.yml'))
|
||||
class NoRedirect(request.HTTPRedirectHandler):
|
||||
def redirect_request(self, req, fp, code, msg, headers, newurl):
|
||||
return None
|
||||
|
||||
def parse_redirect(slug):
|
||||
|
||||
|
||||
DATE_REGEX = r'(?:(?P<year>\d{4})-)?(?P<month>january|february|march|april|may|june|july|august|september|october|november|december)-(?P<date>\d+)(?:-(?P<year2>\d{4})-)?'
|
||||
|
||||
def parse_redirect(kb_id, slug):
|
||||
m = re.search(DATE_REGEX, slug)
|
||||
if m == None:
|
||||
return None
|
||||
else:
|
||||
y = m.group('year') or m.group('year2')
|
||||
date_s = f"{m.group('date')} {m.group('month').title()} {y}"
|
||||
d = int(m.group('date'))
|
||||
if y == None:
|
||||
# if date actually contains the year
|
||||
# set date to 1, and set year to date
|
||||
if d>1900 and d<2030:
|
||||
y = d
|
||||
d = 1
|
||||
else:
|
||||
return None
|
||||
if d>30:
|
||||
return None
|
||||
date_s = f"{d} {m.group('month').title()} {y}"
|
||||
date = datetime.datetime.strptime(date_s, "%d %B %Y").strftime("%Y-%m-%d")
|
||||
return {
|
||||
"date": date,
|
||||
"uuid": slug[-36:],
|
||||
"slug": slug,
|
||||
"url": f"https://support.microsoft.com/en-us/topic/{slug}"
|
||||
"url": f"https://support.microsoft.com/help/{kb_id}"
|
||||
}
|
||||
|
||||
def get_url_slug(kb_id):
|
||||
return redirect_data[int(kb_id)]['redirect']
|
||||
request.install_opener(request.build_opener(NoRedirect))
|
||||
url = f"https://support.microsoft.com/help/{kb_id}"
|
||||
r = urllib.request.Request(url, method="HEAD")
|
||||
try:
|
||||
response = urllib.request.urlopen(r, data=None, timeout=5)
|
||||
except urllib.error.HTTPError as response:
|
||||
if 'location' in response.headers:
|
||||
l = response.headers['location']
|
||||
print(l)
|
||||
return l.split('/')[-1]
|
||||
else:
|
||||
return None
|
||||
return None
|
||||
|
||||
def update_mapping(kb_ids):
|
||||
print(f"Total Count: {len(kb_ids)}")
|
||||
kb = None
|
||||
updated = False
|
||||
with open('data.json', 'r') as f:
|
||||
kb = json.load(f)
|
||||
|
||||
with open(kb_json_file, 'r') as f:
|
||||
for kb_id in kb_ids:
|
||||
if kb_id not in kb:
|
||||
|
||||
slug = get_url_slug(kb_id)
|
||||
new_data = parse_redirect(slug)
|
||||
i = 0
|
||||
for kb_id in kb_ids:
|
||||
i=i+1
|
||||
if kb_id not in kb:
|
||||
print(kb_id)
|
||||
slug = get_url_slug(kb_id)
|
||||
if slug:
|
||||
print(slug)
|
||||
new_data = parse_redirect(kb_id, slug)
|
||||
print(new_data)
|
||||
if new_data:
|
||||
updated = True
|
||||
kb[kb_id] = new_data
|
||||
print(f"Status: {i}/{len(kb_ids)}")
|
||||
else:
|
||||
print("no slug")
|
||||
|
||||
if updated:
|
||||
with open('data.json', 'w') as f:
|
||||
|
|
Loading…
Reference in New Issue