Add retry and timeouts

Nemo 2024-07-04 10:29:24 +05:30
parent 6e8b2563c6
commit 0157806e83
3 changed files with 37 additions and 27 deletions

.gitignore

@@ -1,2 +1,3 @@
 data.yml
 list.json
+.venv


@@ -1,7 +1,11 @@
-beautifulsoup4==4.11.1
+beautifulsoup4==4.12.3
+certifi==2024.7.4
+charset-normalizer==3.3.2
 html5lib==1.1
-pyaml==21.10.1
-PyYAML==6.0
+idna==3.7
+PyYAML==6.0.1
+requests==2.32.3
 six==1.16.0
-soupsieve==2.3.2.post1
+soupsieve==2.5
+urllib3==2.2.2
 webencodings==0.5.1


@@ -3,10 +3,13 @@ import json
 import re
 import datetime
 from bs4 import BeautifulSoup
-from urllib import request
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
+from urllib.request import HTTPRedirectHandler
+from requests import Session
 import urllib.error
 
 
-class NoRedirect(request.HTTPRedirectHandler):
+class NoRedirect(HTTPRedirectHandler):
     def redirect_request(self, req, fp, code, msg, headers, newurl):
         return None
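
For context (not part of the commit): with requests, redirect following is turned off per request via allow_redirects=False rather than by installing a urllib opener, so in the hunks shown the retained NoRedirect handler is no longer installed anywhere. A minimal sketch, using an illustrative KB URL:

import requests

# HEAD the short KB link without following the redirect; the target slug
# is carried in the Location header of the 3xx response.
resp = requests.head(
    "https://support.microsoft.com/help/2919355",  # illustrative KB id
    allow_redirects=False,
    timeout=5,
)
print(resp.status_code, resp.headers.get("location"))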
@@ -40,21 +43,13 @@ def parse_redirect(kb_id, slug):
         "url": f"https://support.microsoft.com/help/{kb_id}"
     }
 
-def get_url_slug(kb_id):
-    request.install_opener(request.build_opener(NoRedirect))
-    url = f"https://support.microsoft.com/help/{kb_id}"
-    r = urllib.request.Request(url, method="HEAD")
-    try:
-        response = urllib.request.urlopen(r, data=None, timeout=5)
-    except urllib.error.HTTPError as response:
-        if 'location' in response.headers:
-            l = response.headers['location']
-            return l.split('/')[-1]
-        else:
-            return None
-    return None
+def get_url_slug(session, kb_id):
+    response = session.head(link, allow_redirects=False, timeout=5)
+    if 'location' in response.headers:
+        l = response.headers['location']
+        return l.split('/')[-1]
 
-def update_mapping(kb_ids):
+def update_mapping(session, kb_ids):
     print(f"Total Count: {len(kb_ids)}")
     kb = None
     updated = False
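
Note: in the rewritten get_url_slug above, the name link is never assigned within the visible hunk, so unless it is defined elsewhere in the file the HEAD request would raise NameError. A hedged sketch of the presumably intended version, rebuilding the KB URL the way the removed code did:

def get_url_slug(session, kb_id):
    # hypothetical correction, not part of the commit
    link = f"https://support.microsoft.com/help/{kb_id}"
    response = session.head(link, allow_redirects=False, timeout=5)
    if 'location' in response.headers:
        return response.headers['location'].split('/')[-1]
    return None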
@@ -65,7 +60,7 @@ def update_mapping(kb_ids):
     for kb_id in kb_ids:
         i=i+1
         if kb_id not in kb:
-            slug = get_url_slug(kb_id)
+            slug = get_url_slug(session, kb_id)
             if slug:
                 new_data = parse_redirect(kb_id, slug)
                 if new_data:
@@ -77,9 +72,10 @@ def update_mapping(kb_ids):
     with open('data.json', 'w') as f:
         f.write(json.dumps(kb, indent=2))
 
-def fetch_kb_mentions(url):
-    with urllib.request.urlopen(url, data=None, timeout=5) as response:
-        soup = BeautifulSoup(response, features="html5lib")
+def fetch_kb_mentions(session, url):
+    with session.get(url, timeout=10) as response:
+        print(url)
+        soup = BeautifulSoup(response.text, features="html5lib")
         for a in soup.find('div', class_='content').find_all('a', href=True):
             l = a['href']
             if l.startswith('https://support.microsoft.com/kb/') or l.startswith('https://support.microsoft.com/help/'):
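
An observation rather than a change in this commit: session.get() does not raise on HTTP error statuses, so a 4xx page (which the retry policy does not cover) would still be parsed for links. A small hedged sketch of how that could be surfaced, using a hypothetical helper name:

def fetch_page(session, url):
    # fetch_page is a hypothetical helper, not in the repository
    response = session.get(url, timeout=10)
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
    return response.text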
@@ -88,8 +84,17 @@ def fetch_kb_mentions(url):
 
 if __name__ == "__main__":
     kbs = []
+    s = Session()
+    retries = Retry(
+        total=3,
+        backoff_factor=0.1,
+        status_forcelist=[502, 503, 504],
+        allowed_methods={'GET'},
+    )
+    s.mount('https://', HTTPAdapter(max_retries=retries))
     with open('discovery.txt', 'r') as f:
         for url in f.readlines():
-            for kb_id in fetch_kb_mentions(url):
+            url = url.strip()
+            for kb_id in fetch_kb_mentions(s, url):
                 kbs.append(kb_id)
-    update_mapping(kbs)
+    update_mapping(s, kbs)
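
The wiring above retries GETs on 502/503/504 with exponential backoff. Two points worth spelling out, as a reading of the configuration rather than anything stated in the commit: allowed_methods={'GET'} restricts status-based retries to GET, so the HEAD calls in get_url_slug are not retried on those codes, and Retry sets no deadline of its own, so every request still passes an explicit timeout. A standalone sketch of the same setup:

from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

session = Session()
retries = Retry(
    total=3,                           # at most 3 retries per request
    backoff_factor=0.1,                # exponential backoff between attempts
    status_forcelist=[502, 503, 504],  # retry only these server errors
    allowed_methods={'GET'},           # status-based retries apply to GET only
)
session.mount('https://', HTTPAdapter(max_retries=retries))

# Retry does not impose a timeout; each call still supplies its own.
response = session.get('https://support.microsoft.com/help/2919355', timeout=10)  # illustrative URL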