Adds restaurant parser
This commit is contained in:
parent
efba5ac0cf
commit
48adab4201
|
@ -1,4 +1,99 @@
|
|||
require 'nokogiri'
|
||||
require 'date'
|
||||
require 'cgi'
|
||||
require 'uri'
|
||||
require 'json'
|
||||
require "csv"
|
||||
|
||||
# Placeholder list; nothing in the visible script reads it (only the
# commented-out legacy code below reassigns a variable of this name).
restaurants = []

# Output file for the parsed rows: data/<today's ISO date>-all.csv, opened for
# writing (truncates an existing file). The data/ directory must already exist.
csv = CSV.open("data/#{Date.today}-all.csv", 'w')
|
||||
|
||||
# restaurants = CSV.read("data/#{Date.today.to_s}.csv", headers: true)
|
||||
# restaurants_map = {}
|
||||
|
||||
# restaurants.each do |row|
|
||||
# begin
|
||||
# url = URI.parse(row['url'])
|
||||
# rescue URI::InvalidURIError
|
||||
# url = URI.parse(CGI.escape(row['url']))
|
||||
# end
|
||||
# id = row['url'].split('/').last
|
||||
# restaurants_map[url.path.strip] = {
|
||||
# 'url': url,
|
||||
# 'title': row['title'],
|
||||
# 'location':row['location'],
|
||||
# 'address':row['address']
|
||||
# }
|
||||
# end
|
||||
|
||||
# def join_props(h, key, data)
|
||||
# data.keys.each do |k|
|
||||
# h["#{key}.#{k}"] = data[k]
|
||||
# end
|
||||
# end
|
||||
|
||||
# Substring identifying the HTML line that carries the embedded page state.
STATE_MARKER = '__PRELOADED_STATE__'
# Text that immediately precedes the JSON string literal on that line.
STATE_OPENER = 'JSON.parse('
# Fallback written when a restaurant has no rating streak:
# 2020-01-01 formatted with '%s' (a String).
DEFAULT_RATING_TIMESTAMP = Date.new(2020, 1, 1).strftime('%s')

# Extracts the embedded page state from one line of a saved restaurant page.
#
# @param line [String] a single line of the HTML file
# @return [Hash, nil] the decoded state, or nil when the line does not contain
#   both the state marker and a `JSON.parse(` call
# @raise [JSON::ParserError] when the captured payload is not valid JSON
def preloaded_state(line)
  return nil unless line.include?(STATE_MARKER)

  opener_at = line.index(STATE_OPENER)
  return nil unless opener_at

  # The slice stops two characters before the end of the line.
  # NOTE(review): presumably this drops the closing ")" plus one trailing
  # character (";" or a newline) -- confirm against a real page.
  payload = line[(opener_at + STATE_OPENER.length)..-3]

  # The payload is a JSON string literal whose content is itself JSON,
  # hence the double decode.
  JSON.parse(JSON.parse(payload))
end

# Flattens the decoded page state into one CSV row.
#
# Key insertion order defines the CSV column order, so it must stay stable.
#
# @param data [Hash] state as returned by preloaded_state
# @return [Hash{String => Object}, nil] column name => value, or nil when the
#   page reports itself as a '404'
def restaurant_row(data)
  current = data['pages']['current']
  # Checked before touching any restaurant data, so a 404 page is skipped
  # rather than failing on fields it does not carry.
  return nil if current['name'] == '404'

  listings = data['pages']['restaurant']
  # Only the first entry of the restaurant map is read.
  sections = listings[listings.keys.first]['sections']
  basic    = sections['SECTION_BASIC_INFO']
  rating   = basic['rating']
  contact  = sections['SECTION_RES_CONTACT']
  ratings  = sections['SECTION_RATING_HISTOGRAM']

  histogram = ratings['histogram'] if ratings && ratings.key?('histogram')
  streak    = ratings['rating_streak'] if ratings && ratings.key?('rating_streak')
  last_vote = streak && streak[-1]

  row = {}
  row['rating']           = rating['aggregate_rating']
  row['rating_text']      = rating['rating_text']
  row['rating_subtitle']  = rating['rating_subtitle']
  row['rating_votes']     = rating['votes']
  row['is_delivery_only'] = basic['is_delivery_only']
  row['name']             = current['pageTitle'].strip
  row['is_perm_closed']   = basic['is_perm_closed']
  row['is_temp_closed']   = basic['is_temp_closed']
  row['is_opening_soon']  = basic['is_opening_soon']
  row['should_ban_ugc']   = basic['should_ban_ugc']
  row['is_shelled']       = basic['is_shelled']
  row['disclaimer_text']  = basic['disclaimer_text'].strip
  row['cuisines'] = sections['SECTION_RES_HEADER_DETAILS']['CUISINES'].map { |c| c['name'] }.join(', ').strip

  # Ratings with 5,4,3,2,1 stars: histogram index 0 maps to rating_5_count,
  # index 4 to rating_1_count. All zero when there is no histogram.
  5.downto(1).each_with_index do |stars, idx|
    row["rating_#{stars}_count"] = histogram ? histogram[idx]['value'] : 0
  end

  row['zipcode']          = contact['zipcode']
  row['is_dark_kitchen']  = contact['is_dark_kitchen']
  row['locality_verbose'] = contact['locality_verbose'].strip
  row['latitude']         = contact['latitude']
  row['longitude']        = contact['longitude']
  row['address']          = contact['address'].strip
  row['contact'] = contact['is_phone_available'] ? contact['phoneDetails']['phoneStr'] : ""

  # First run of digits in the chain text (kept as a String); 0 when the text
  # is missing, empty, or contains no digits.
  row['chain_count'] = contact['res_chain_text'].to_s[/\d+/] || 0

  row['status'] = basic['res_status_text']
  row['last_rating_timestamp'] = last_vote ? last_vote['timestamp'] : DEFAULT_RATING_TIMESTAMP
  row['id'] = current['resId']
  row
end

# Writes one CSV row per state-bearing line found in the saved restaurant
# pages; the header row is taken from the keys of the first parsed restaurant.
# `csv` is the writer opened earlier in this script.
# NOTE(review): the writer is never closed in this file; output relies on the
# handle being flushed at process exit.
first = true
Dir.glob('html/restaurants/*.html') do |file|
  File.readlines(file).each do |line|
    data = preloaded_state(line)
    next unless data

    rest = restaurant_row(data)
    next unless rest

    if first
      csv << rest.keys
      first = false
    end
    csv << rest.values
  end
end
|
Loading…
Reference in New Issue