From 48adab420101072ce3587723545d076357fb1f3c Mon Sep 17 00:00:00 2001
From: Nemo <me@captnemo.in>
Date: Thu, 30 Jul 2020 03:07:14 +0530
Subject: [PATCH] Adds restaurant parser

---
 parse_restaurants.rb | 97 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)

diff --git a/parse_restaurants.rb b/parse_restaurants.rb
index f1ae60b..a2a74c7 100644
--- a/parse_restaurants.rb
+++ b/parse_restaurants.rb
@@ -1,4 +1,99 @@
 require 'nokogiri'
+require 'date'
+require 'cgi'
+require 'uri'
+require 'json'
 require "csv"
 
-restaurants = []
\ No newline at end of file
+restaurants = [] # not read by the live code below; only the commented-out loader reassigns it
+
+csv = CSV.open("data/#{Date.today.to_s}-all.csv", 'w') # one output file per run date; 'w' truncates an existing file
+
+# restaurants = CSV.read("data/#{Date.today.to_s}.csv", headers: true)
+# restaurants_map = {}
+
+# restaurants.each do |row|
+#   begin
+#     url = URI.parse(row['url'])
+#   rescue URI::InvalidURIError
+#     url = URI.parse(CGI.escape(row['url']))
+#   end
+#   id = row['url'].split('/').last
+#   restaurants_map[url.path.strip] = {
+#     'url': url,
+#     'title': row['title'],
+#     'location':row['location'],
+#     'address':row['address']
+#   }
+# end
+
+# def join_props(h, key, data)
+#   data.keys.each do |k|
+#     h["#{key}.#{k}"] = data[k]
+#   end
+# end
+
+first = true
+Dir.glob('html/restaurants/*.html') do |file|
+  # puts file
+  url = nil
+  rest = Hash.new
+  File.readlines(file).each do |l|
+    if l.match /__PRELOADED_STATE__/
+      start = l.index("JSON.parse(") + "JSON.parse(".length
+      end_ = -3
+      data = JSON.parse(JSON.parse l[start..end_])
+      url = data['pages']['current']['pageUrl'].strip
+      r = data['pages']['restaurant'][data['pages']['restaurant'].keys.first]
+      if data['pages']['current']['name'] == '404'
+        next
+      end
+      rest['rating'] = r['sections']['SECTION_BASIC_INFO']['rating']['aggregate_rating']
+      rest['rating_text'] = r['sections']['SECTION_BASIC_INFO']['rating']['rating_text']
+      rest['rating_subtitle'] = r['sections']['SECTION_BASIC_INFO']['rating']['rating_subtitle']
+      rest['rating_votes'] = r['sections']['SECTION_BASIC_INFO']['rating']['votes']
+      rest["is_delivery_only"] =  r['sections']["SECTION_BASIC_INFO"]["is_delivery_only"]
+      rest["name"] = data['pages']['current']['pageTitle'].strip
+      rest["is_perm_closed"] =  r['sections']["SECTION_BASIC_INFO"]["is_perm_closed"]
+      rest["is_temp_closed"] =  r['sections']["SECTION_BASIC_INFO"]["is_temp_closed"]
+      rest["is_opening_soon"] =  r['sections']["SECTION_BASIC_INFO"]["is_opening_soon"]
+      rest["should_ban_ugc"] =  r['sections']["SECTION_BASIC_INFO"]["should_ban_ugc"]
+      rest["is_shelled"] =  r['sections']["SECTION_BASIC_INFO"]["is_shelled"]
+      rest["disclaimer_text"] = r['sections']["SECTION_BASIC_INFO"]["disclaimer_text"].strip
+      rest["cuisines"] = r['sections']["SECTION_RES_HEADER_DETAILS"]["CUISINES"].map{|e| e['name']}.join(', ').strip
+      ratings = r['sections']['SECTION_RATING_HISTOGRAM']
+      h = ratings['histogram'] if ratings and ratings.has_key? 'histogram'
+      # Ratings with 5,4,3,2,1 stars
+      rest['rating_5_count'] = h ? h[0]['value'] : 0
+      rest['rating_4_count'] = h ? h[1]['value'] : 0
+      rest['rating_3_count'] = h ? h[2]['value'] : 0
+      rest['rating_2_count'] = h ? h[3]['value'] : 0
+      rest['rating_1_count'] = h ? h[4]['value'] : 0
+      rest["zipcode"] = r['sections']['SECTION_RES_CONTACT']["zipcode"]
+      rest["is_dark_kitchen"] = r['sections']['SECTION_RES_CONTACT']["is_dark_kitchen"]
+      rest["locality_verbose"] = r['sections']['SECTION_RES_CONTACT']["locality_verbose"].strip
+      rest["latitude"] = r['sections']['SECTION_RES_CONTACT']["latitude"]
+      rest["longitude"] = r['sections']['SECTION_RES_CONTACT']["longitude"]
+      rest["address"] = r['sections']['SECTION_RES_CONTACT']["address"].strip
+      rest["contact"] = r['sections']['SECTION_RES_CONTACT']["is_phone_available"] ? r['sections']['SECTION_RES_CONTACT']["phoneDetails"]["phoneStr"] : ""
+      if r['sections']['SECTION_RES_CONTACT']["res_chain_text"].empty?
+        rest["chain_count"] = 0
+      else
+        rest["chain_count"] = r['sections']['SECTION_RES_CONTACT']["res_chain_text"].match(/\d+/)[0]
+      end
+      rest['status'] = r['sections']['SECTION_BASIC_INFO']['res_status_text']
+      if ratings and ratings.has_key? 'rating_streak'
+        rest['last_rating_timestamp'] = ratings['rating_streak'][-1]['timestamp']
+      else
+        rest['last_rating_timestamp'] = Date.new(2020, 01,01).strftime('%s')
+      end
+      rest['id'] = data['pages']['current']['resId']
+      if first
+        csv << rest.keys
+        first = false
+      end
+      csv << rest.values
+    end
+  end
+
+end
\ No newline at end of file