Compare commits
8 Commits
Author | SHA1 | Date |
---|---|---|
Nemo | 5921640da3 | |
Nemo | 1e67eb8281 | |
Nemo | 48adab4201 | |
Nemo | efba5ac0cf | |
Nemo | 1e20e93cc7 | |
Nemo | a73b58f802 | |
Nemo | dd95ae348a | |
Nemo | a91c242951 |
|
@ -1,3 +1,4 @@
|
|||
vendor/
|
||||
html/
|
||||
*.csv
|
||||
data/
|
|
@ -1,9 +1,9 @@
|
|||
GEM
|
||||
remote: https://rubygems.org/
|
||||
specs:
|
||||
mini_portile2 (2.3.0)
|
||||
nokogiri (1.8.1)
|
||||
mini_portile2 (~> 2.3.0)
|
||||
mini_portile2 (2.4.0)
|
||||
nokogiri (1.10.10)
|
||||
mini_portile2 (~> 2.4.0)
|
||||
|
||||
PLATFORMS
|
||||
ruby
|
||||
|
@ -12,4 +12,4 @@ DEPENDENCIES
|
|||
nokogiri
|
||||
|
||||
BUNDLED WITH
|
||||
1.16.1
|
||||
2.1.4
|
||||
|
|
|
@ -14,11 +14,10 @@ This project uses GNU Parallel, Ruby, Nokogiri, and curl.
|
|||
|
||||
- Keep track of historical data using regularly generated CSV files
|
||||
- Does not use the API (since the rate-limit is too low at 1k/day)
|
||||
+ We need to checkout around 15k restaurant status (closed or not)
|
||||
- We need to checkout around 15k restaurant status (closed or not)
|
||||
- Keep track of whether restaurant is still alive or not
|
||||
- Tweet any restaurant closures (or any new openings)
|
||||
|
||||
|
||||
For now, run the following command to get a diff of new restaurants not in the old listings:
|
||||
|
||||
`q -d , "SELECT * from ./15-01-2018.csv WHERE c1 not in (SELECT c1 from 06-01-2018.csv)"`
|
||||
`q -d , "SELECT * from ./2018-MM-DD.csv WHERE c1 not in (SELECT c1 from 2018-MM-DD.csv)"`
|
||||
|
|
|
@ -9,7 +9,7 @@ mkdir -p html/restaurants
|
|||
|
||||
function dl_z() {
|
||||
echo "[+] $2"
|
||||
curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
|
||||
[ ! -f "html/$2" ] && curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
|
||||
}
|
||||
|
||||
export -f dl_z
|
||||
|
@ -26,9 +26,9 @@ done <<< $(bundle exec ruby parse_dir.rb)
|
|||
while read -r LINK; do
|
||||
FILENAME="$(basename $LINK).html"
|
||||
echo $FILENAME
|
||||
# sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
|
||||
sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
|
||||
done <<< $(bundle exec ruby parse_listing.rb)
|
||||
|
||||
# bundle exec ruby parse_restaurant.rb
|
||||
bundle exec ruby parse_restaurant.rb
|
||||
|
||||
# sem --wait
|
||||
sem --wait
|
|
@ -4,25 +4,9 @@ require 'date'
|
|||
|
||||
restaurants = []
|
||||
|
||||
CSV.open("#{Date.today.to_s}.csv", 'wb') do |csv|
|
||||
csv << ['url', 'title', 'location', 'address', 'cuisine']
|
||||
Dir.glob('html/restaurants-*.html') do |file|
|
||||
Dir.glob('html/restaurants-*.html') do |file|
|
||||
page = Nokogiri::HTML(open(file))
|
||||
|
||||
page.css('.plr10').each do |div|
|
||||
links = div.css('a')
|
||||
spans = div.css('span')
|
||||
|
||||
|
||||
title = links[0].text
|
||||
location = links[1].text
|
||||
address = spans[1].text
|
||||
cuisine = spans[0].text
|
||||
url = links[0]['href']
|
||||
|
||||
csv << [url, title, location, address, cuisine]
|
||||
|
||||
puts url
|
||||
end
|
||||
puts div.css('a')[0]['href']
|
||||
end
|
||||
end
|
|
@ -1,4 +1,77 @@
|
|||
require 'nokogiri' # NOTE(review): not referenced in this script — confirm before removing
require 'date'
require 'cgi'      # NOTE(review): not referenced in this script — confirm before removing
require 'uri'      # NOTE(review): not referenced in this script — confirm before removing
require 'json'
require 'csv'

# Extract one record per restaurant from the downloaded pages in
# html/restaurants/ and write them to data/<YYYY-MM-DD>.csv.
#
# Each page embeds its data as a single line of the form:
#   window.__PRELOADED_STATE__ = JSON.parse("...escaped JSON...");
# We locate that line, unwrap the JS string literal, and parse the JSON.
PARSE_PREFIX = 'JSON.parse('.freeze

# Block form guarantees the CSV handle is flushed and closed even if a page
# fails to parse (the original opened the file and never closed it).
CSV.open("data/#{Date.today}.csv", 'w') do |csv|
  header_written = false

  Dir.glob('html/restaurants/*.html') do |file|
    File.readlines(file).each do |line|
      next unless line =~ /__PRELOADED_STATE__/

      start = line.index(PARSE_PREFIX) + PARSE_PREFIX.length
      # Double parse: the outer call decodes the JS string literal (the line
      # ends with `");`, so we stop at index -3); the inner call decodes the
      # JSON document itself.
      data = JSON.parse(JSON.parse(line[start..-3]))

      # Pages that resolved to a 404 placeholder still carry a state blob.
      next if data['pages']['current']['name'] == '404'

      r = data['pages']['restaurant'][data['pages']['restaurant'].keys.first]
      basic   = r['sections']['SECTION_BASIC_INFO']
      contact = r['sections']['SECTION_RES_CONTACT']
      ratings = r['sections']['SECTION_RATING_HISTOGRAM']

      # Key insertion order defines the CSV column order — do not reorder.
      rest = {}
      rest['rating'] = basic['rating']['aggregate_rating']
      rest['rating_text'] = basic['rating']['rating_text']
      rest['rating_subtitle'] = basic['rating']['rating_subtitle']
      rest['rating_votes'] = basic['rating']['votes']
      rest['is_delivery_only'] = basic['is_delivery_only']
      rest['name'] = data['pages']['current']['pageTitle'].strip
      rest['is_perm_closed'] = basic['is_perm_closed']
      rest['is_temp_closed'] = basic['is_temp_closed']
      rest['is_opening_soon'] = basic['is_opening_soon']
      rest['should_ban_ugc'] = basic['should_ban_ugc']
      rest['is_shelled'] = basic['is_shelled']
      rest['disclaimer_text'] = basic['disclaimer_text'].strip
      rest['cuisines'] = r['sections']['SECTION_RES_HEADER_DETAILS']['CUISINES'].map { |c| c['name'] }.join(', ').strip

      # Histogram buckets arrive ordered 5-star down to 1-star; default to 0
      # when the section or key is missing.
      h = ratings['histogram'] if ratings && ratings.key?('histogram')
      rest['rating_5_count'] = h ? h[0]['value'] : 0
      rest['rating_4_count'] = h ? h[1]['value'] : 0
      rest['rating_3_count'] = h ? h[2]['value'] : 0
      rest['rating_2_count'] = h ? h[3]['value'] : 0
      rest['rating_1_count'] = h ? h[4]['value'] : 0

      rest['zipcode'] = contact['zipcode']
      rest['is_dark_kitchen'] = contact['is_dark_kitchen']
      rest['locality_verbose'] = contact['locality_verbose'].strip
      rest['latitude'] = contact['latitude']
      rest['longitude'] = contact['longitude']
      rest['address'] = contact['address'].strip
      rest['contact'] = contact['is_phone_available'] ? contact['phoneDetails']['phoneStr'] : ""

      # res_chain_text is e.g. "N outlets"; empty means a standalone venue.
      chain_text = contact['res_chain_text']
      rest['chain_count'] = chain_text.empty? ? 0 : chain_text.match(/\d+/)[0]

      rest['status'] = basic['res_status_text']

      if ratings && ratings.key?('rating_streak')
        last_rated = Time.at(ratings['rating_streak'][-1]['timestamp'])
        rest['last_rating_timestamp'] = last_rated.strftime('%Y-%m-%d')
        rest['last_rating_month'] = last_rated.strftime('%Y-%m')
      else
        # Epoch sentinel for restaurants with no rating history.
        rest['last_rating_timestamp'] = '1970-01-01'
        rest['last_rating_month'] = '1970-01'
      end

      rest['id'] = data['pages']['current']['resId']

      # Header row comes from the first record's keys, written exactly once.
      csv << rest.keys unless header_written
      header_written = true
      csv << rest.values
    end
  end
end
|
|
@ -0,0 +1,37 @@
|
|||
require 'csv'
require 'set'

# Aggregate the daily restaurant snapshots in data/*.csv into stats.csv:
# one row per snapshot date, one column per locality, each cell holding the
# number of restaurants seen in that locality on that date.

data = {}
locales = Set.new

Dir.glob("data/*.csv") do |file|
  # Snapshot files are named <date>.csv; the basename is the row key.
  date_key = File.basename file, '.csv'
  data[date_key] = {}
  puts file
  begin
    CSV.foreach(file, headers: true) do |row|
      locale = row[2]
      # Malformed rows may lack the locality column; skip them rather than
      # aborting the whole file (the original raised NoMethodError here and
      # the blanket rescue silently dropped the rest of the file).
      next if locale.nil?

      # Normalize "<area>, Bangalore" down to just the area name.
      locale = locale[0..-12] if locale.end_with?(", Bangalore")
      locales << locale
      data[date_key][locale] ||= 0
      data[date_key][locale] += 1
    end
  rescue StandardError => e
    # Never rescue Exception (it swallows SignalException/SystemExit), and
    # never swallow silently — report the bad snapshot and carry on.
    warn "skipping #{file}: #{e.class}: #{e.message}"
  end
end

# Emit columns in a stable, sorted order so diffs between runs are meaningful.
locales = locales.to_a.sort
CSV.open("stats.csv", "wb") do |csv|
  csv << ["date"].concat(locales)
  data.each do |date, census|
    row = [date]
    locales.each do |l|
      # Localities absent on a given date count as zero.
      row << (census[l] || 0)
    end
    csv << row
  end
end
|
Loading…
Reference in New Issue