"""Extract package URLs from the tools sitemaps and write packages.csv."""
# Standard library.
import csv
import json
import operator  # NOTE(review): appears unused below — confirm before removing.
from glob import glob
from urllib.parse import urlparse

# Third-party: lxml handles the sitemap XML namespaces via xpath.
from lxml import etree
|
def extract_urls_from_sitemaps():
    """Collect every ``<loc>`` URL from the ``sitemaps/tools*.xml`` files.

    Returns:
        set[str]: de-duplicated URL strings found across all matching
        sitemap files (empty set if no files match the glob).
    """
    url_set = set()
    for sitemap_path in glob('sitemaps/tools*.xml'):
        with open(sitemap_path) as xml_file:
            tree = etree.parse(xml_file)
            # local-name() matches <loc> regardless of the sitemap's
            # XML namespace, so no namespace map is needed.
            for url in tree.xpath("//*[local-name()='loc']/text()"):
                url_set.add(url)
    return url_set
|
def write_packages_csv():
    """Write ``packages.csv`` mapping each sitemap URL to its ecosystem.

    Reads the known ecosystem prefixes from ``prefixes.json``, pulls all
    URLs from the tools sitemaps, and writes a two-column CSV
    (``url``, ``ecosystem``) sorted by ecosystem then URL. URLs whose
    first path segment does not start with a known prefix get ``None``
    (an empty CSV cell) in the ecosystem column.
    """
    rows = []
    with open('prefixes.json') as prefix_file, \
            open('packages.csv', 'w', newline='') as packages_csv:
        prefixes = json.load(prefix_file)
        packages_writer = csv.writer(packages_csv)
        packages_writer.writerow(['url', 'ecosystem'])

        for url in extract_urls_from_sitemaps():
            path_parts = urlparse(url).path.split('/')

            package = None
            # len() guard: a path without a leading '/' splits into a
            # single segment, and path_parts[1] would raise IndexError.
            if len(path_parts) > 1 and path_parts[1]:
                candidate = path_parts[1].split('-')[0]
                if candidate in prefixes:
                    package = candidate

            rows.append([url, package])

        # Sort by ecosystem then URL (the original key duplicated x[1]).
        # Map None -> '' so rows without an ecosystem compare cleanly
        # against string ecosystems instead of raising TypeError.
        rows.sort(key=lambda row: (row[1] or '', row[0]))
        packages_writer.writerows(rows)
|
if __name__ == '__main__':
    # Script entry point: regenerate packages.csv from the sitemaps.
    write_packages_csv()