Although a single run now takes 5GB, we get a more detailed CSV.

This commit is contained in:
Nemo 2020-07-30 03:48:57 +05:30
parent 1e67eb8281
commit 5921640da3
2 changed files with 9 additions and 23 deletions

View File

@ -4,25 +4,9 @@ require 'date'
restaurants = [] restaurants = []
CSV.open("data/#{Date.today.to_s}.csv", 'wb') do |csv| Dir.glob('html/restaurants-*.html') do |file|
csv << ['url', 'title', 'location', 'address', 'cuisine'] page = Nokogiri::HTML(open(file))
Dir.glob('html/restaurants-*.html') do |file| page.css('.plr10').each do |div|
page = Nokogiri::HTML(open(file)) puts div.css('a')[0]['href']
page.css('.plr10').each do |div|
links = div.css('a')
spans = div.css('span')
title = links[0].text
location = links[1].text
address = spans[1].text
cuisine = spans[0].text
url = links[0]['href']
csv << [url, title, location, address, cuisine]
puts url
end
end end
end end

View File

@ -7,7 +7,7 @@ require "csv"
restaurants = [] restaurants = []
csv = CSV.open("data/#{Date.today.to_s}-all.csv", 'w') csv = CSV.open("data/#{Date.today.to_s}.csv", 'w')
first = true first = true
Dir.glob('html/restaurants/*.html') do |file| Dir.glob('html/restaurants/*.html') do |file|
@ -59,9 +59,11 @@ Dir.glob('html/restaurants/*.html') do |file|
end end
rest['status'] = r['sections']['SECTION_BASIC_INFO']['res_status_text'] rest['status'] = r['sections']['SECTION_BASIC_INFO']['res_status_text']
if ratings and ratings.has_key? 'rating_streak' if ratings and ratings.has_key? 'rating_streak'
rest['last_rating_timestamp'] = ratings['rating_streak'][-1]['timestamp'] rest['last_rating_timestamp'] = Time.at(ratings['rating_streak'][-1]['timestamp']).strftime('%Y-%m-%d')
rest['last_rating_month'] = Time.at(ratings['rating_streak'][-1]['timestamp']).strftime('%Y-%m')
else else
rest['last_rating_timestamp'] = Date.new(2020, 01,01).strftime('%s') rest['last_rating_timestamp'] = '1970-01-01'
rest['last_rating_month'] = '1970-01'
end end
rest['id'] = data['pages']['current']['resId'] rest['id'] = data['pages']['current']['resId']
if first if first