hn-classics/add-metadata.rb

80 lines
1.8 KiB
Ruby
Raw Permalink Normal View History

require 'json'
require 'shellwords'
require 'yaml'

require 'curb'
require 'front_matter_parser'
# Parsed contents of stories.json; iterated further down as year => list of stories.
stories = JSON.parse(File.read('stories.json'))
2018-02-25 20:11:00 +00:00
# Tells whether a story URL points at content this script does not keep.
#
# @param url [String] the story URL (callers skip nil URLs before calling)
# @return [Boolean] true when the URL ends in ".pdf" (in any letter case) or
#   contains "fermatslibrary"; false otherwise
def is_hard_url(url)
  # We don't want PDFs for now; downcase so ".PDF" is caught as well
  return true if url.downcase.end_with?('.pdf')
  # Or fermatslibrary links
  return true if url.include?('fermatslibrary')

  false
end
2018-02-25 10:48:00 +00:00
2018-03-03 09:35:28 +00:00
# Converts the page behind a story URL to GitHub-flavoured Markdown and saves
# it, preceded by the story metadata as YAML.
#
# The URL is handed to parse.py (not shown here) and that script's output is
# piped through pandoc. The file is written only when the resulting Markdown
# is longer than 100 characters; shorter results are discarded.
#
# @param url [String] address passed to parse.py
# @param story [#to_yaml] story metadata written before the "---" separator
# @param fn [String] path of the Markdown file to write
# @return [nil]
def process(url, story, fn)
  puts "[DL] #{url}"

  # The URL comes from story data, so it is shell-escaped: quotes, `$(...)`
  # or backticks inside it must reach parse.py verbatim instead of being
  # interpreted by the shell.
  # Convert to GFM
  markdown = `python parse.py #{Shellwords.escape(url)} | pandoc --from=html --to=gfm-raw_html-native_divs-native_spans-fenced_divs`

  # Very short output is treated as a failed conversion and not saved.
  return unless markdown.size > 100

  content = "#{story.to_yaml}\n---\n#{markdown}"
  File.open(fn, "w") { |file| file.write content }
  puts "[info] Saved"
end
2018-02-25 10:48:00 +00:00
2018-06-08 12:05:27 +00:00
# Writes a story file: the metadata as YAML, a "---" separator line, then the body.
#
# @param file_name [String] path of the file to create or overwrite
# @param front_matter [#to_yaml] metadata serialised ahead of the separator
# @param content [String] text placed after the separator
# @return [Integer] number of bytes written
def write_story(file_name, front_matter, content)
  yaml_header = front_matter.to_yaml
  document = "#{yaml_header}\n---\n#{content}"
  puts "Writing to #{file_name}"
  File.write(file_name, document)
end
2018-03-03 09:35:28 +00:00
# Walk every story, grouped by year, and bring its Markdown file
# (_stories/<year>/<objectID>.md) up to date.
stories.each do |year, stories_by_year|
  stories_by_year.each do |story|
    story.delete '_highlightResult'
    id = story['objectID']
    url = story['url']

    fn = "_stories/#{year}/#{id}.md"

    next if url.nil?

    # Hard URLs are never kept: remove any file saved for them earlier.
    if is_hard_url(url)
      File.delete fn if File.exist? fn
      next
    end

    # Same value the path segment after "_stories/" would give.
    story['year'] = year.to_i

    # Only stories that already have a file are refreshed; no download is
    # attempted for stories without one.
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    next unless File.exist? fn

    puts fn
    parsed = FrontMatterParser::Parser.parse_file(fn)

    # If no frontmatter or if it does not contain the year
    if parsed.front_matter.nil? || !parsed.front_matter.key?('year')
      write_story(fn, story, parsed.content)
    end

    # Body is blank or a single line: drop the file and convert the page again
    body = parsed.content.strip
    if body.empty? || body.split("\n").size == 1
      File.delete fn
      process url, story, fn
    end
  end
end