# frozen_string_literal: true

# Reads every saved restaurant page matching html/restaurants/*.html, pulls the
# page state out of the line containing `__PRELOADED_STATE__`, and writes one
# CSV row per parsed state to data/<today>-all.csv. The header row is written
# once, from the keys of the first row produced.

require 'nokogiri'
require 'date'
require 'cgi'
require 'uri'
require 'json'
require 'csv'

# Disabled draft (kept verbatim): builds a lookup of the listing CSV keyed by
# URL path.
#
# restaurants = CSV.read("data/#{Date.today.to_s}.csv", headers: true)
# restaurants_map = {}

# restaurants.each do |row|
#   begin
#     url = URI.parse(row['url'])
#   rescue URI::InvalidURIError
#     url = URI.parse(CGI.escape(row['url']))
#   end
#   id = row['url'].split('/').last
#   restaurants_map[url.path.strip] = {
#     'url': url,
#     'title': row['title'],
#     'location':row['location'],
#     'address':row['address']
#   }
# end

# def join_props(h, key, data)
#   data.keys.each do |k|
#     h["#{key}.#{k}"] = data[k]
#   end
# end

# Substring that identifies the line carrying the page state.
STATE_MARKER = '__PRELOADED_STATE__'

# Text that immediately precedes the JSON string literal on that line.
JSON_CALL = 'JSON.parse('

# Fallback for `last_rating_timestamp` when a page has no rating streak:
# 2020-01-01 rendered as epoch seconds (a String, via strftime('%s')).
DEFAULT_LAST_RATING_TIMESTAMP = Date.new(2020, 1, 1).strftime('%s')

# Decodes the page state embedded in one line of a saved page.
#
# @param line [String] a single line of the HTML file, line ending included
# @return [Hash, nil] the decoded state, or nil when the line does not carry
#   both the state marker and a `JSON.parse(` call
# @raise [JSON::ParserError] when the embedded payload is not valid JSON
def preloaded_state(line)
  return unless line.include?(STATE_MARKER)

  call_at = line.index(JSON_CALL)
  return unless call_at

  # The slice stops two characters short of the end of the line.
  # NOTE(review): presumably this trims the closing `)` plus one trailing
  # character so only the quoted JSON string remains -- confirm against a page.
  payload = line[(call_at + JSON_CALL.length)..-3]

  # The payload is a JSON string literal whose content is itself JSON, hence
  # the two decoding passes.
  JSON.parse(JSON.parse(payload))
end

# Extracts the number from the chain text.
#
# @param chain_text [String, nil] value of `res_chain_text`
# @return [String, Integer] the first run of digits, or 0 when the text is
#   empty, nil, or contains no digits
def chain_count(chain_text)
  chain_text.to_s[/\d+/] || 0
end

# Timestamp of the last entry in the rating streak.
#
# @param ratings [Hash, nil] the SECTION_RATING_HISTOGRAM section
# @return [Object] the `timestamp` of the last streak entry, or
#   DEFAULT_LAST_RATING_TIMESTAMP when the section or streak is absent or empty
def last_rating_timestamp(ratings)
  streak = ratings && ratings['rating_streak']
  latest = streak && streak.last
  latest ? latest['timestamp'] : DEFAULT_LAST_RATING_TIMESTAMP
end

# Flattens one decoded page state into an ordered column => value hash.
# Insertion order is the CSV column order, so it must not be rearranged.
#
# @param data [Hash] state returned by #preloaded_state
# @return [Hash, nil] the row, or nil when the state describes a 404 page
def restaurant_row(data)
  current = data['pages']['current']
  # Check for 404 before touching the restaurant entry.
  return if current['name'] == '404'

  listing = data['pages']['restaurant']
  sections = listing[listing.keys.first]['sections']
  basic = sections['SECTION_BASIC_INFO']
  rating = basic['rating']
  contact = sections['SECTION_RES_CONTACT']
  ratings = sections['SECTION_RATING_HISTOGRAM']
  histogram = ratings && ratings['histogram']

  row = {}
  row['rating'] = rating['aggregate_rating']
  row['rating_text'] = rating['rating_text']
  row['rating_subtitle'] = rating['rating_subtitle']
  row['rating_votes'] = rating['votes']
  row['is_delivery_only'] = basic['is_delivery_only']
  row['name'] = current['pageTitle'].strip
  row['is_perm_closed'] = basic['is_perm_closed']
  row['is_temp_closed'] = basic['is_temp_closed']
  row['is_opening_soon'] = basic['is_opening_soon']
  row['should_ban_ugc'] = basic['should_ban_ugc']
  row['is_shelled'] = basic['is_shelled']
  row['disclaimer_text'] = basic['disclaimer_text'].strip
  row['cuisines'] = sections['SECTION_RES_HEADER_DETAILS']['CUISINES'].map { |c| c['name'] }.join(', ').strip

  # Ratings with 5,4,3,2,1 stars: histogram index 0 is read as the 5-star
  # bucket, index 4 as the 1-star bucket. Pages without a histogram get zeros.
  5.downto(1).each_with_index do |stars, idx|
    row["rating_#{stars}_count"] = histogram ? histogram[idx]['value'] : 0
  end

  row['zipcode'] = contact['zipcode']
  row['is_dark_kitchen'] = contact['is_dark_kitchen']
  row['locality_verbose'] = contact['locality_verbose'].strip
  row['latitude'] = contact['latitude']
  row['longitude'] = contact['longitude']
  row['address'] = contact['address'].strip
  # NOTE(review): only nil/false count as "no phone" here; a numeric 0 flag
  # would be truthy in Ruby -- confirm the type of `is_phone_available`.
  row['contact'] = contact['is_phone_available'] ? contact['phoneDetails']['phoneStr'] : ''
  row['chain_count'] = chain_count(contact['res_chain_text'])
  row['status'] = basic['res_status_text']
  row['last_rating_timestamp'] = last_rating_timestamp(ratings)
  row['id'] = current['resId']
  row
end

# Writes one CSV row per page state found in the files matching `pattern`.
#
# @param pattern [String] glob of saved restaurant pages
# @param out_path [String] CSV file to (over)write; its directory must exist
# @return [void]
def export_restaurants(pattern, out_path)
  # Block form closes (and flushes) the CSV even if a page fails to parse.
  CSV.open(out_path, 'w') do |csv|
    header_written = false

    # Sorted so the row order does not depend on filesystem enumeration order.
    Dir.glob(pattern).sort.each do |file|
      File.foreach(file) do |line|
        data = preloaded_state(line)
        next unless data

        row = restaurant_row(data)
        next unless row

        unless header_written
          csv << row.keys
          header_written = true
        end
        csv << row.values
      end
    end
  end
end

export_restaurants('html/restaurants/*.html', "data/#{Date.today}-all.csv")