# hn-classics/add-metadata.rb
#
# 80 lines
# 1.8 KiB
# Ruby

require 'json'
require 'shellwords'
require 'yaml'
require 'curb'
require 'front_matter_parser'
# Parse stories.json from the current directory; the loop further down
# iterates the result as year => list-of-stories pairs.
stories = JSON.parse(File.read('stories.json'))
# Reports whether +url+ is one this script refuses to convert.
#
# A URL is "hard" when either of the following holds:
# * its last four characters are ".pdf" (PDFs are not wanted for now), or
# * it contains "fermatslibrary" anywhere.
#
# url - String to inspect.
#
# Returns true for a hard URL and false otherwise.
def is_hard_url(url)
  pdf_link = url.end_with?('.pdf')
  fermat_link = url.include?('fermatslibrary')
  pdf_link || fermat_link
end
# Runs parse.py on +url+, pipes its output through pandoc (HTML to GitHub
# flavoured Markdown) and saves the result to +fn+ with +story+ serialised as
# YAML ahead of a "---" separator.
#
# url   - String passed as the single argument to parse.py.
# story - object whose #to_yaml output becomes the file's front matter.
# fn    - String path of the file to write.
#
# Nothing is written when the conversion produces 100 characters or fewer.
# Returns nil.
def process(url, story, fn)
  puts "[DL] #{url}"
  # The URL originates from external story data, so it is shell-escaped before
  # being placed on the command line; a quote or `;` inside it must not be
  # able to terminate the argument and run arbitrary commands.
  command = "python parse.py #{Shellwords.escape(url)} | " \
            'pandoc --from=html --to=gfm-raw_html-native_divs-native_spans-fenced_divs'
  markdown = `#{command}`
  # Very short output is treated as a failed conversion and discarded.
  return unless markdown.size > 100

  File.write(fn, "#{story.to_yaml}\n---\n#{markdown}")
  puts "[info] Saved"
end
# Writes a story file made of +front_matter+ rendered as YAML, a "---"
# separator line, and then +content+.
#
# file_name    - String path to (over)write.
# front_matter - object responding to #to_yaml.
# content      - body text placed after the separator.
#
# Returns the number of bytes written.
def write_story(file_name, front_matter, content)
  document = "#{front_matter.to_yaml}\n---\n#{content}"
  puts "Writing to #{file_name}"
  File.write(file_name, document)
end
# Walk every story, grouped by year, and bring its Markdown file at
# _stories/<year>/<objectID>.md up to date:
# * stories without a URL are skipped;
# * files for "hard" URLs (see is_hard_url) are removed;
# * existing files get front matter containing the year if it is missing;
# * existing files with at most one line of body are deleted and re-processed.
stories.each do |year, stories_by_year|
  stories_by_year.each do |story|
    story.delete '_highlightResult'
    url = story['url']
    next if url.nil?

    fn = "_stories/#{year}/#{story['objectID']}.md"
    if is_hard_url(url)
      File.delete fn if File.exist? fn
      next
    end

    story['year'] = year.to_i

    # Stories that have no file yet are left alone. The original code had a
    # download step here, but it sat behind an unconditional `next` and never
    # ran, so it has been dropped rather than silently re-enabled.
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    next unless File.exist? fn

    puts fn
    parsed = FrontMatterParser::Parser.parse_file(fn)

    # Rewrite the file when it has no front matter or the year is missing.
    if parsed.front_matter.nil? || !parsed.front_matter.key?('year')
      write_story(fn, story, parsed.content)
    end

    # A body that is blank or a single line is treated as an empty file:
    # remove it and attempt the conversion again.
    body_lines = parsed.content.strip.split("\n")
    if body_lines.size <= 1
      File.delete fn
      process url, story, fn
    end
  end
end