# frozen_string_literal: true

# Walks every saved page under html/restaurants/, decodes the preloaded-state
# JSON embedded in it and writes one CSV row per restaurant to
# data/<today>.csv. The header row is taken from the keys of the first row.

require 'nokogiri'
require 'date'
require 'cgi'
require 'uri'
require 'json'
require 'csv'

# Text identifying the line of a saved page that carries the embedded state.
STATE_MARKER = '__PRELOADED_STATE__'
# Text that immediately precedes the JSON string literal on that line.
JSON_CALL = 'JSON.parse('

# Decodes the preloaded-state payload embedded in +line+.
#
# @param line [String] one line of a saved page, trailing newline included
# @return [Hash, nil] the decoded state, or nil when the line has no marker
#   or no +JSON.parse(+ call to read the payload from
# @raise [JSON::ParserError] when the payload is not valid JSON
def preloaded_state(line)
  return unless line.include?(STATE_MARKER)

  offset = line.index(JSON_CALL)
  return unless offset

  # The slice stops at index -3, i.e. it drops the last two characters of the
  # line. NOTE(review): presumably the closing paren plus one terminator
  # character (newline or ';') -- confirm against a saved page.
  literal = line[(offset + JSON_CALL.length)..-3]

  # Parsed twice: the inner call must yield a String (the unescaped JSON
  # text) for the outer call to turn it into the actual structure.
  JSON.parse(JSON.parse(literal))
end

# Flattens one decoded state into a CSV row.
#
# Insertion order of the returned Hash defines the CSV column order, so keys
# must keep being added in this exact sequence.
#
# @param data [Hash] decoded preloaded state of a restaurant page
# @return [Hash{String => Object}] column name => value
def restaurant_row(data)
  current = data['pages']['current']
  by_id = data['pages']['restaurant']
  # Only the first entry of the restaurant map is exported.
  sections = by_id[by_id.keys.first]['sections']
  basic = sections['SECTION_BASIC_INFO']
  rating = basic['rating']
  contact = sections['SECTION_RES_CONTACT']
  ratings = sections['SECTION_RATING_HISTOGRAM']

  row = {
    'rating' => rating['aggregate_rating'],
    'rating_text' => rating['rating_text'],
    'rating_subtitle' => rating['rating_subtitle'],
    'rating_votes' => rating['votes'],
    'is_delivery_only' => basic['is_delivery_only'],
    'name' => current['pageTitle'].strip,
    'is_perm_closed' => basic['is_perm_closed'],
    'is_temp_closed' => basic['is_temp_closed'],
    'is_opening_soon' => basic['is_opening_soon'],
    'should_ban_ugc' => basic['should_ban_ugc'],
    'is_shelled' => basic['is_shelled'],
    'disclaimer_text' => basic['disclaimer_text'].strip,
    'cuisines' => sections['SECTION_RES_HEADER_DETAILS']['CUISINES']
                  .map { |cuisine| cuisine['name'] }.join(', ').strip
  }

  # Histogram entries are read positionally: index 0 feeds the 5-star column,
  # index 4 the 1-star column. Every count is 0 when there is no histogram.
  histogram = ratings && ratings['histogram']
  5.downto(1).each_with_index do |stars, idx|
    row["rating_#{stars}_count"] = histogram ? histogram[idx]['value'] : 0
  end

  row['zipcode'] = contact['zipcode']
  row['is_dark_kitchen'] = contact['is_dark_kitchen']
  row['locality_verbose'] = contact['locality_verbose'].strip
  row['latitude'] = contact['latitude']
  row['longitude'] = contact['longitude']
  row['address'] = contact['address'].strip
  row['contact'] = contact['is_phone_available'] ? contact['phoneDetails']['phoneStr'] : ''
  # First run of digits in the chain text; 0 when the text is missing, empty
  # or contains no digits (the latter two used to raise NoMethodError).
  row['chain_count'] = contact['res_chain_text'].to_s[/\d+/] || 0
  row['status'] = basic['res_status_text']

  # Date of the last entry of the rating streak; falls back to the epoch
  # placeholders when the streak is absent or empty.
  streak = ratings && ratings['rating_streak']
  latest = streak&.last
  if latest
    # Time.at yields a local-time value, so the rendered date follows the
    # time zone of the machine running the script.
    rated_at = Time.at(latest['timestamp'])
    row['last_rating_timestamp'] = rated_at.strftime('%Y-%m-%d')
    row['last_rating_month'] = rated_at.strftime('%Y-%m')
  else
    row['last_rating_timestamp'] = '1970-01-01'
    row['last_rating_month'] = '1970-01'
  end

  row['id'] = current['resId']
  row
end

# Block form guarantees the CSV is flushed and closed, even if a page raises.
CSV.open("data/#{Date.today}.csv", 'w') do |csv|
  header_written = false

  Dir.glob('html/restaurants/*.html') do |file|
    File.foreach(file) do |line|
      data = preloaded_state(line)
      next unless data
      # Checked before any restaurant data is touched: a 404 page is skipped
      # without dereferencing sections it may not have.
      next if data['pages']['current']['name'] == '404'

      row = restaurant_row(data)
      unless header_written
        csv << row.keys
        header_written = true
      end
      csv << row.values
    end
  end
end