Compare commits

...

14 Commits

Author SHA1 Message Date
Nemo 5921640da3 Now, although it takes 5GB to run this once, we get a more detailed CSV 2020-07-30 03:48:57 +05:30
Nemo 1e67eb8281 Remove commented code 2020-07-30 03:07:27 +05:30
Nemo 48adab4201 Adds restaurant parser 2020-07-30 03:07:14 +05:30
Nemo efba5ac0cf Run parallel and only download missing files 2020-07-29 22:46:39 +05:30
Nemo 1e20e93cc7 Upgrade bundler 2020-07-29 22:36:33 +05:30
Nemo a73b58f802 Adds stats script 2018-12-25 19:58:04 +05:30
Nemo dd95ae348a Fixes query command 2018-04-08 23:01:58 +05:30
Nemo a91c242951 Move to data/ directory 2018-04-08 23:01:29 +05:30
Nemo 272053df38 write file to date 2018-03-30 13:50:03 +05:30
Nemo 3a0545d053 How to get a diff 2018-01-15 02:08:10 +05:30
Nemo ecc340e641 Minor style changes 2018-01-15 02:01:40 +05:30
Nemo 92b3175ce0 Minor changes 2018-01-06 19:37:40 +05:30
Nemo 42cb68468c Removes unused gems 2018-01-02 07:04:08 +05:30
Nemo 4d333fbcbf Update restaurant count 2018-01-02 01:18:29 +05:30
9 changed files with 152 additions and 48 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
vendor/
html/
*.csv
*.csv
data/

View File

@ -1,4 +1,3 @@
source 'https://rubygems.org'
gem 'net-http2'
gem 'nokogiri'

View File

@ -1,19 +1,15 @@
GEM
remote: https://rubygems.org/
specs:
http-2 (0.8.4)
mini_portile2 (2.3.0)
net-http2 (0.16.0)
http-2 (= 0.8.4)
nokogiri (1.8.1)
mini_portile2 (~> 2.3.0)
mini_portile2 (2.4.0)
nokogiri (1.10.10)
mini_portile2 (~> 2.4.0)
PLATFORMS
ruby
DEPENDENCIES
net-http2
nokogiri
BUNDLED WITH
1.16.1
2.1.4

View File

@ -4,7 +4,7 @@ Keep track of restaurant openings and closures in the city.
# Quirks
- Zomato does not support HTTP/1.1, so wget can't be used.
- Zomato does not support HTTP/1.1, so wget can't be used.
# Tech
@ -12,8 +12,12 @@ This project uses GNU Parallel, Ruby, Nokogiri, and curl.
# Features
- Keep track of historical data using regularly generated CSV files
- Does not use the API (since the rate-limit is too low at 1k/day)
+ We need to checkout around 8k restaurant status
- Keep track of whether restaurant is still alive or not
- Tweet any restaurant closures (or any new openings)
- Keep track of historical data using regularly generated CSV files
- Does not use the API (since the rate-limit is too low at 1k/day)
- We need to check the status of around 15k restaurants (closed or not)
- Keep track of whether restaurant is still alive or not
- Tweet any restaurant closures (or any new openings)
For now, run the following command to get a diff of new restaurants not in the old listings:
`q -d , "SELECT * from ./2018-MM-DD.csv WHERE c1 not in (SELECT c1 from 2018-MM-DD.csv)"` (substitute the newer dump's date into the first filename and the older dump's date into the second)

View File

@ -1,6 +1,6 @@
#!/bin/bash
ZOMATO_ROOT_URL=https://www.zomato.com
ZOMATO_ROOT_URL="https://www.zomato.com"
ZOMATO_CITY=bangalore
DIRECTORY_URL="$ZOMATO_ROOT_URL/$ZOMATO_CITY/directory"
USER_AGENT="Mozilla/Gecko/Firefox/58.0"
@ -9,21 +9,26 @@ mkdir -p html/restaurants
function dl_z() {
echo "[+] $2"
curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
[ ! -f "html/$2" ] && curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
}
export -f dl_z
dl "$DIRECTORY_URL" "directory.html"
dl_z "$DIRECTORY_URL" "directory.html"
# Download all the listing pages
while read -r LINK; do
FILENAME="$(basename $LINK).html"
dl "$LINK" "$FILENAME"
dl_z "$LINK" "$FILENAME"
done <<< $(bundle exec ruby parse_dir.rb)
# Download all the restaurant pages (~15k)
while read -r LINK; do
FILENAME="$(basename $LINK).html"
echo $FILENAME
sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
done <<< $(bundle exec ruby parse_restaurant.rb)
done <<< $(bundle exec ruby parse_listing.rb)
bundle exec ruby parse_restaurant.rb
sem --wait

12
parse_listing.rb Normal file
View File

@ -0,0 +1,12 @@
require 'nokogiri'
require 'csv'
require 'date'

# Prints the URL of every restaurant linked from the downloaded listing
# pages (html/restaurants-*.html), one per line, so the output can be
# consumed by the downloader shell script.
#
# Fixes over the original: dropped the unused `restaurants` accumulator;
# File.read replaces Kernel#open (which leaked a file descriptor and will
# execute a command for "|"-prefixed names); anchor-less cards are skipped
# instead of raising NoMethodError on nil.
Dir.glob('html/restaurants-*.html') do |file|
  page = Nokogiri::HTML(File.read(file))
  page.css('.plr10').each do |div|
    link = div.css('a').first
    puts link['href'] if link
  end
end

View File

@ -1,27 +0,0 @@
require 'nokogiri'
require "csv"

# Scrapes every downloaded listing page and writes one row per restaurant
# card (.plr10 block) into database.csv, echoing each URL to stdout as it
# goes.  Row order matches the header written first.
restaurants = [] # NOTE(review): unused accumulator, kept from the original
CSV.open("database.csv", "wb") do |out|
  out << ["url", "title", "location", "address", "cuisine"]
  Dir.glob("html/restaurants-*.html") do |listing|
    doc = Nokogiri::HTML(open(listing))
    doc.css('.plr10').each do |card|
      anchors = card.css('a')
      labels = card.css('span')
      href = anchors[0]['href']
      # First anchor: title link; second anchor: location link;
      # spans carry cuisine (0) and address (1).
      out << [href, anchors[0].text, anchors[1].text, labels[1].text, labels[0].text]
      puts href
    end
  end
end

77
parse_restaurants.rb Normal file
View File

@ -0,0 +1,77 @@
require 'nokogiri'
require 'date'
require 'cgi'
require 'uri'
require 'json'
require "csv"

# Parses every downloaded restaurant page (html/restaurants/*.html), pulls
# the __PRELOADED_STATE__ JSON blob each page embeds, and writes one row
# per restaurant into data/<today>.csv.  The header row is the key set of
# the first restaurant parsed.
#
# Fixes over the original: CSV.open is used in block form so the output
# file is flushed and closed deterministically instead of at process exit;
# the unused `restaurants` accumulator and the never-emitted `url` local
# were dropped; the repeated section lookups are aliased once.
CSV.open("data/#{Date.today}.csv", 'w') do |csv|
  first = true # emit the header row exactly once
  Dir.glob('html/restaurants/*.html') do |file|
    rest = Hash.new
    File.readlines(file).each do |l|
      # Only the line carrying the embedded state object is of interest.
      next unless l.match?(/__PRELOADED_STATE__/)

      # The payload looks like `... JSON.parse("<escaped json>");` — slice
      # out the quoted string ([start..-3] drops the trailing `");`) and
      # parse twice: once for the JS string, once for the object inside it.
      start = l.index("JSON.parse(") + "JSON.parse(".length
      data = JSON.parse(JSON.parse l[start..-3])
      next if data['pages']['current']['name'] == '404' # page is gone

      sections = data['pages']['restaurant'][data['pages']['restaurant'].keys.first]['sections']
      basic = sections['SECTION_BASIC_INFO']
      contact = sections['SECTION_RES_CONTACT']

      # Insertion order below defines the CSV column order — keep it.
      rest['rating'] = basic['rating']['aggregate_rating']
      rest['rating_text'] = basic['rating']['rating_text']
      rest['rating_subtitle'] = basic['rating']['rating_subtitle']
      rest['rating_votes'] = basic['rating']['votes']
      rest["is_delivery_only"] = basic["is_delivery_only"]
      rest["name"] = data['pages']['current']['pageTitle'].strip
      rest["is_perm_closed"] = basic["is_perm_closed"]
      rest["is_temp_closed"] = basic["is_temp_closed"]
      rest["is_opening_soon"] = basic["is_opening_soon"]
      rest["should_ban_ugc"] = basic["should_ban_ugc"]
      rest["is_shelled"] = basic["is_shelled"]
      rest["disclaimer_text"] = basic["disclaimer_text"].strip
      rest["cuisines"] = sections["SECTION_RES_HEADER_DETAILS"]["CUISINES"].map { |e| e['name'] }.join(', ').strip
      ratings = sections['SECTION_RATING_HISTOGRAM']
      h = ratings['histogram'] if ratings&.key?('histogram')
      # Ratings with 5,4,3,2,1 stars
      rest['rating_5_count'] = h ? h[0]['value'] : 0
      rest['rating_4_count'] = h ? h[1]['value'] : 0
      rest['rating_3_count'] = h ? h[2]['value'] : 0
      rest['rating_2_count'] = h ? h[3]['value'] : 0
      rest['rating_1_count'] = h ? h[4]['value'] : 0
      rest["zipcode"] = contact["zipcode"]
      rest["is_dark_kitchen"] = contact["is_dark_kitchen"]
      rest["locality_verbose"] = contact["locality_verbose"].strip
      rest["latitude"] = contact["latitude"]
      rest["longitude"] = contact["longitude"]
      rest["address"] = contact["address"].strip
      rest["contact"] = contact["is_phone_available"] ? contact["phoneDetails"]["phoneStr"] : ""
      # Pull the first number out of res_chain_text when present;
      # standalone restaurants have it empty.
      rest["chain_count"] = if contact["res_chain_text"].empty?
                              0
                            else
                              contact["res_chain_text"].match(/\d+/)[0]
                            end
      rest['status'] = basic['res_status_text']
      if ratings&.key?('rating_streak')
        # Take the last entry of the streak array as the most recent rating
        # — assumes streak entries are in chronological order (TODO confirm).
        rest['last_rating_timestamp'] = Time.at(ratings['rating_streak'][-1]['timestamp']).strftime('%Y-%m-%d')
        rest['last_rating_month'] = Time.at(ratings['rating_streak'][-1]['timestamp']).strftime('%Y-%m')
      else
        # Epoch sentinel: no ratings recorded at all.
        rest['last_rating_timestamp'] = '1970-01-01'
        rest['last_rating_month'] = '1970-01'
      end
      rest['id'] = data['pages']['current']['resId']

      if first
        csv << rest.keys
        first = false
      end
      csv << rest.values
    end
  end
end

37
stats.rb Normal file
View File

@ -0,0 +1,37 @@
require 'csv'
require 'set'

# Aggregates per-locality restaurant counts out of every daily CSV dump in
# data/ and writes a pivot table to stats.csv: one row per date, one
# column per locality (sorted), with 0 filled in for missing localities.
#
# Fixes over the original: `rescue Exception` with an empty handler
# (which silently swallowed everything, even SystemExit) is narrowed to
# StandardError and reported; a nil locale in a short/malformed row is
# skipped instead of crashing out of the rest of the file.
data = {}
locales = Set.new
Dir.glob("data/*.csv") do |file|
  date_key = File.basename file, '.csv'
  # Default every locality's tally to 0 so we can increment blindly.
  data[date_key] = Hash.new(0)
  puts file
  begin
    CSV.foreach(file, headers: true) do |row|
      # Column 2 holds the locality in the historical dump format.
      locale = row[2]
      next if locale.nil?

      # Strip the redundant ", Bangalore" suffix (11 chars) when present.
      locale = locale[0..-12] if locale[-11..-1] == ", Bangalore"
      locales << locale
      data[date_key][locale] += 1
    end
  rescue StandardError => e
    # Best-effort: one malformed dump should not abort the whole run,
    # but say what happened instead of hiding it.
    warn "Skipping rest of #{file}: #{e.class}: #{e.message}"
  end
end
locales = locales.to_a.sort
CSV.open("stats.csv", "wb") do |csv|
  csv << ["date"].concat(locales)
  data.each do |date, census|
    # Emit counts in the same sorted locality order as the header row.
    csv << [date].concat(locales.map { |l| census[l] || 0 })
  end
end