mirror of
https://github.com/captn3m0/microsoft-kb-metadata.git
synced 2024-09-16 10:16:06 +00:00
Add retry and timeouts
This commit is contained in:
parent
6e8b2563c6
commit
0157806e83
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
data.yml
|
||||
list.json
|
||||
.venv
|
@ -1,7 +1,11 @@
|
||||
beautifulsoup4==4.11.1
|
||||
beautifulsoup4==4.12.3
|
||||
certifi==2024.7.4
|
||||
charset-normalizer==3.3.2
|
||||
html5lib==1.1
|
||||
pyaml==21.10.1
|
||||
PyYAML==6.0
|
||||
idna==3.7
|
||||
PyYAML==6.0.1
|
||||
requests==2.32.3
|
||||
six==1.16.0
|
||||
soupsieve==2.3.2.post1
|
||||
soupsieve==2.5
|
||||
urllib3==2.2.2
|
||||
webencodings==0.5.1
|
||||
|
49
update.py
49
update.py
@ -3,10 +3,13 @@ import json
|
||||
import re
|
||||
import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib import request
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util import Retry
|
||||
from urllib.request import HTTPRedirectHandler
|
||||
from requests import Session
|
||||
import urllib.error
|
||||
|
||||
class NoRedirect(request.HTTPRedirectHandler):
|
||||
class NoRedirect(HTTPRedirectHandler):
|
||||
def redirect_request(self, req, fp, code, msg, headers, newurl):
|
||||
return None
|
||||
|
||||
@ -40,21 +43,13 @@ def parse_redirect(kb_id, slug):
|
||||
"url": f"https://support.microsoft.com/help/{kb_id}"
|
||||
}
|
||||
|
||||
def get_url_slug(kb_id):
|
||||
request.install_opener(request.build_opener(NoRedirect))
|
||||
url = f"https://support.microsoft.com/help/{kb_id}"
|
||||
r = urllib.request.Request(url, method="HEAD")
|
||||
try:
|
||||
response = urllib.request.urlopen(r, data=None, timeout=5)
|
||||
except urllib.error.HTTPError as response:
|
||||
if 'location' in response.headers:
|
||||
l = response.headers['location']
|
||||
return l.split('/')[-1]
|
||||
else:
|
||||
return None
|
||||
return None
|
||||
def get_url_slug(session, kb_id):
    """Return the slug that a KB's help URL redirects to, or None.

    Sends a HEAD request for ``https://support.microsoft.com/help/<kb_id>``
    with redirects disabled and a 5 second timeout, then reads the
    ``Location`` header of the response.

    Args:
        session: object exposing a ``head(url, allow_redirects=..., timeout=...)``
            method whose result has a ``headers`` mapping (a
            ``requests.Session`` in this script).
        kb_id: KB identifier interpolated into the help URL.

    Returns:
        The last ``/``-separated component of the ``Location`` header, or
        ``None`` when the response carries no such header.
    """
    # The URL has to be built here: the previous body passed an undefined
    # name (`link`) to session.head(), which raised NameError on every call.
    url = f"https://support.microsoft.com/help/{kb_id}"
    response = session.head(url, allow_redirects=False, timeout=5)
    location = response.headers.get('location')
    if not location:
        return None
    return location.split('/')[-1]
|
||||
|
||||
def update_mapping(kb_ids):
|
||||
def update_mapping(session, kb_ids):
|
||||
print(f"Total Count: {len(kb_ids)}")
|
||||
kb = None
|
||||
updated = False
|
||||
@ -65,7 +60,7 @@ def update_mapping(kb_ids):
|
||||
for kb_id in kb_ids:
|
||||
i=i+1
|
||||
if kb_id not in kb:
|
||||
slug = get_url_slug(kb_id)
|
||||
slug = get_url_slug(session, kb_id)
|
||||
if slug:
|
||||
new_data = parse_redirect(kb_id, slug)
|
||||
if new_data:
|
||||
@ -77,9 +72,10 @@ def update_mapping(kb_ids):
|
||||
with open('data.json', 'w') as f:
|
||||
f.write(json.dumps(kb, indent=2))
|
||||
|
||||
def fetch_kb_mentions(url):
|
||||
with urllib.request.urlopen(url, data=None, timeout=5) as response:
|
||||
soup = BeautifulSoup(response, features="html5lib")
|
||||
def fetch_kb_mentions(session, url):
|
||||
with session.get(url, timeout=10) as response:
|
||||
print(url)
|
||||
soup = BeautifulSoup(response.text, features="html5lib")
|
||||
for a in soup.find('div', class_='content').find_all('a', href=True):
|
||||
l = a['href']
|
||||
if l.startswith('https://support.microsoft.com/kb/') or l.startswith('https://support.microsoft.com/help/'):
|
||||
@ -88,8 +84,17 @@ def fetch_kb_mentions(url):
|
||||
|
||||
if __name__ == "__main__":
    # Gather every KB id mentioned on the pages listed in discovery.txt,
    # then hand the full list to update_mapping() together with the shared
    # HTTP session.
    kbs = []
    # Use the session as a context manager so its pooled connections are
    # released when the run finishes.
    with Session() as s:
        retries = Retry(
            total=3,
            backoff_factor=0.1,
            status_forcelist=[502, 503, 504],
            # get_url_slug() issues HEAD requests through this session, so
            # HEAD must be listed alongside GET for status-based retries to
            # apply to those lookups as well.
            allowed_methods={'GET', 'HEAD'},
        )
        s.mount('https://', HTTPAdapter(max_retries=retries))
        with open('discovery.txt', 'r') as f:
            for url in f:
                url = url.strip()
                # Skip blank lines; an empty string is not a fetchable URL.
                if not url:
                    continue
                for kb_id in fetch_kb_mentions(s, url):
                    kbs.append(kb_id)
        update_mapping(s, kbs)
|
Loading…
Reference in New Issue
Block a user