Initial Commit
This commit is contained in:
commit
f89e5f4f14
|
@ -0,0 +1,2 @@
|
|||
*.xml
|
||||
*.pyc
|
|
@ -0,0 +1,12 @@
|
|||
# Each sitemap page is fetched in a chain (tools4 depends on tools3, etc.)
# so that `make sitemaps/tools4.xml` pulls down the whole set in order.
sitemaps/tools4.xml: sitemaps/tools3.xml
	wget -P sitemaps --quiet --timestamping https://stackshare.io/sitemaps/tools4.xml

sitemaps/tools3.xml: sitemaps/tools2.xml
	wget -P sitemaps --quiet --timestamping https://stackshare.io/sitemaps/tools3.xml

# Bug fix: this rule previously downloaded tools.xml, so tools2.xml was
# never created and the dependency chain above re-ran on every build.
sitemaps/tools2.xml: sitemaps/tools.xml
	wget -P sitemaps --quiet --timestamping https://stackshare.io/sitemaps/tools2.xml

sitemaps/tools.xml:
	wget -P sitemaps --quiet --timestamping https://stackshare.io/sitemaps/tools.xml

# Derive the package list from the downloaded sitemaps.
packages.csv: sitemaps/tools4.xml
	python src/packages.py

# Derive the tools list from everything that is not a package.
tools.csv: packages.csv
	python src/tools.py
|
|
@ -0,0 +1,7 @@
|
|||
# stackshare-dataset
|
||||
|
||||
A dataset from stackshare.io providing lists of packages and various services. While a list of packages for
|
||||
various ecosystems is easily available elsewhere, a list of services is much harder.
|
||||
|
||||
See `tools.csv` for a complete list. I'd recommend sorting by popularity and using the top 2.5-3k results
|
||||
depending on your use case.
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,28 @@
|
|||
[
|
||||
"helm",
|
||||
"inqlude",
|
||||
"purescript",
|
||||
"haxelib",
|
||||
"nimble",
|
||||
"dub",
|
||||
"homebrew",
|
||||
"conda",
|
||||
"elm",
|
||||
"julia",
|
||||
"carthage",
|
||||
"swiftpm",
|
||||
"clojars",
|
||||
"pub",
|
||||
"cocoapods",
|
||||
"cran",
|
||||
"hackage",
|
||||
"bower",
|
||||
"rubygems",
|
||||
"packagist",
|
||||
"pypi",
|
||||
"nuget",
|
||||
"npm",
|
||||
"terraform",
|
||||
"go",
|
||||
"maven"
|
||||
]
|
|
@ -0,0 +1,40 @@
|
|||
from lxml import etree
|
||||
import csv
|
||||
import json
|
||||
from glob import glob
|
||||
from urllib.parse import urlparse
|
||||
import operator
|
||||
|
||||
def extract_urls_from_sitemaps():
    """Collect every ``<loc>`` URL from the downloaded sitemap files.

    Parses all ``sitemaps/tools*.xml`` files and returns the union of
    their ``<loc>`` text values as a set of URL strings (duplicates
    across files are collapsed by the set).

    Returns:
        set[str]: all URLs listed in the sitemaps.
    """
    url_set = set()
    for file in glob('sitemaps/tools*.xml'):
        with open(file) as xml_file:
            tree = etree.parse(xml_file)
            # local-name() sidesteps the sitemap XML namespace so the
            # query works regardless of the declared namespace prefix.
            # (Removed an unused `root = tree.getroot()` local.)
            url_set.update(tree.xpath("//*[local-name()='loc']/text()"))
    return url_set
|
||||
|
||||
def write_packages_csv():
    """Write ``packages.csv`` (url, ecosystem) for every package URL.

    A sitemap URL counts as a package when its first path segment begins
    with a known ecosystem prefix from ``prefixes.json`` (e.g.
    ``/npm-left-pad`` -> ecosystem ``npm``).  Output rows are sorted by
    ecosystem, then URL, for a stable, diff-friendly file.
    """
    rows = []
    with open('prefixes.json') as prefix_file, open('packages.csv', 'w', newline='') as packages_csv:
        prefixes = json.load(prefix_file)
        packages_writer = csv.writer(packages_csv)
        packages_writer.writerow(['url', 'ecosystem'])

        for url in extract_urls_from_sitemaps():
            parsed_url = urlparse(url)
            path_parts = parsed_url.path.split('/')

            # A path normally starts with '/', so path_parts[0] is ''
            # and path_parts[1] is the first real segment; the length
            # guard avoids an IndexError on a slash-less path.
            if len(path_parts) > 1 and path_parts[1]:
                ecosystem = path_parts[1].split('-')[0]
                if ecosystem in prefixes:
                    rows.append([url, ecosystem])

        # Bug fix: the key was (x[1], x[1]) — the duplicated element
        # left URL order within an ecosystem undefined.  Sort by
        # ecosystem, then URL (this is what the file-level
        # `import operator` was evidently intended for).
        rows.sort(key=operator.itemgetter(1, 0))
        packages_writer.writerows(rows)
|
||||
|
||||
# Script entry point: regenerate packages.csv from the local sitemaps.
if __name__ == '__main__':
    write_packages_csv()
|
|
@ -0,0 +1,94 @@
|
|||
import http.client
|
||||
import csv
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
import os
|
||||
import urllib.parse
|
||||
from math import floor
|
||||
from packages import extract_urls_from_sitemaps
|
||||
|
||||
# Column order for tools.csv; get_row() must yield values in this order.
HEADERS = ['url', 'object_id', 'name', 'title', 'popularity', 'votes', 'verified', 'description', 'stack_count', 'type', 'category', 'layer', 'function']
# Shared HTTPS connection to stackshare's Algolia search host, reused by
# every make_request() call (no actual connect happens until the first request).
conn = http.client.HTTPSConnection("km8652f2eg-dsn.algolia.net")
|
||||
|
||||
def tools_except_packages():
    """Return the sitemap URLs that are not listed in packages.csv.

    Everything in the sitemaps minus the known package URLs — i.e. the
    candidate "tool"/service pages.
    """
    every_url = extract_urls_from_sitemaps()
    # Build the exclusion set from the first column of packages.csv.
    # (The header row's 'url' value is harmless: it never appears in
    # the sitemap URL set.)
    with open('packages.csv') as packages_file:
        known_packages = {record[0] for record in csv.reader(packages_file)}
    return every_url - known_packages
|
||||
|
||||
def make_request(search):
    """Look up a tool path in stackshare's Algolia search index.

    Args:
        search: a canonical URL path such as ``/python``.

    Returns:
        The first Algolia hit whose ``canonical_url`` equals ``search``,
        or ``None`` — printing ``MISS: <path>`` when the index returned
        no hits at all.  (Hits that exist but don't match return None
        silently, matching the original behavior.)
    """
    payload = json.dumps({
        "query": search,
        "hitsPerPage": 3,
        # Stackups are comparison pages, not tools — exclude them.
        "filters": "NOT type:Stackup"
    })

    headers = {
        'Accept': "application/json",
        'Accept-Encoding': "deflate",
        "Content-Type": "application/json"
    }

    # NOTE(review): the application id and api key are embedded in the
    # query string — presumably stackshare's public client-side search
    # key; confirm before publishing this repository.
    conn.request("POST", "/1/indexes/Search_production/query?x-algolia-application-id=KM8652F2EG&x-algolia-api-key=YzFhZWIwOGRhOWMyMjdhZTI5Yzc2OWM4OWFkNzc3ZTVjZGFkNDdmMThkZThiNDEzN2Y1NmI3MTQxYjM4MDI3MmZpbHRlcnM9cHJpdmF0ZSUzRDA%3D", payload, headers)

    res = conn.getresponse()
    d = json.loads(res.read().decode("utf-8"))

    if len(d['hits']) >= 1:
        for x in d['hits']:
            if x['canonical_url'] == search:
                # Bug fix: the original had an unreachable `break`
                # directly after this return; removed.
                return x
    else:
        print(f"MISS: {search}")
|
||||
|
||||
def get_row(ignore_set):
    """Yield one tools.csv row per tool URL not already processed.

    Args:
        ignore_set: URLs to skip (already present in tools.csv), which
            makes reruns resumable.

    Yields:
        A list matching HEADERS for each search hit that is not a package.
    """
    for tool_url in tools_except_packages():
        # Guard clauses: skip already-written URLs, failed lookups,
        # and anything the index flags as a package.
        if tool_url in ignore_set:
            continue
        hit = make_request(urlparse(tool_url).path)
        if not hit or hit['is_package'] != False:
            continue
        yield [
            tool_url,
            hit['objectID'],
            hit['name'],
            hit['title'],
            hit['popularity'],
            hit['votes_count'],
            hit['verified'],
            hit['description'],
            hit['company_stacks_count'],
            hit['type'].lower(),
            hit['category']['slug'],
            hit['layer'].lower(),
            hit['function']['slug']
        ]
|
||||
|
||||
if __name__ == '__main__':
    # First run: create the output file with just the header row.
    if not os.path.exists('tools.csv'):
        with open('tools.csv', 'w', newline='') as fresh:
            csv.writer(fresh).writerow(HEADERS)

    # Resume support: collect every URL already written so a rerun
    # only fetches what is still missing.
    already_written = set()
    with open('tools.csv') as existing:
        for record in csv.reader(existing):
            already_written.add(record[0])

    # Append newly fetched rows.
    with open('tools.csv', 'a', newline='') as out:
        appender = csv.writer(out)
        for record in get_row(already_written):
            appender.writerow(record)

    # Rewrite tools.csv sorted by popularity (column index 4), descending.
    with open('tools.csv', 'r') as f:
        rows_reader = csv.reader(f)
        next(rows_reader, None)  # drop the header before sorting
        by_popularity = sorted(rows_reader, key=lambda rec: float(rec[4]), reverse=True)
    with open('tools.csv', 'w', newline='') as f:
        final_writer = csv.writer(f)
        final_writer.writerow(HEADERS)
        final_writer.writerows(by_popularity)
|
Loading…
Reference in New Issue