Browse Source

Now, although it takes 5GB to run this once, we get a more detailed CSV

master
Nemo 11 months ago
parent
commit
5921640da3
  1. 24
      parse_listing.rb
  2. 8
      parse_restaurants.rb

24
parse_listing.rb

@ -4,25 +4,9 @@ require 'date'
restaurants = []
CSV.open("data/#{Date.today.to_s}.csv", 'wb') do |csv|
csv << ['url', 'title', 'location', 'address', 'cuisine']
Dir.glob('html/restaurants-*.html') do |file|
page = Nokogiri::HTML(open(file))
page.css('.plr10').each do |div|
links = div.css('a')
spans = div.css('span')
title = links[0].text
location = links[1].text
address = spans[1].text
cuisine = spans[0].text
url = links[0]['href']
csv << [url, title, location, address, cuisine]
puts url
end
Dir.glob('html/restaurants-*.html') do |file|
page = Nokogiri::HTML(open(file))
page.css('.plr10').each do |div|
puts div.css('a')[0]['href']
end
end

8
parse_restaurants.rb

@ -7,7 +7,7 @@ require "csv"
restaurants = []
csv = CSV.open("data/#{Date.today.to_s}-all.csv", 'w')
csv = CSV.open("data/#{Date.today.to_s}.csv", 'w')
first = true
Dir.glob('html/restaurants/*.html') do |file|
@ -59,9 +59,11 @@ Dir.glob('html/restaurants/*.html') do |file|
end
rest['status'] = r['sections']['SECTION_BASIC_INFO']['res_status_text']
if ratings and ratings.has_key? 'rating_streak'
rest['last_rating_timestamp'] = ratings['rating_streak'][-1]['timestamp']
rest['last_rating_timestamp'] = Time.at(ratings['rating_streak'][-1]['timestamp']).strftime('%Y-%m-%d')
rest['last_rating_month'] = Time.at(ratings['rating_streak'][-1]['timestamp']).strftime('%Y-%m')
else
rest['last_rating_timestamp'] = Date.new(2020, 01,01).strftime('%s')
rest['last_rating_timestamp'] = '1970-01-01'
rest['last_rating_month'] = '1970-01'
end
rest['id'] = data['pages']['current']['resId']
if first

Loading…
Cancel
Save