Compare commits
8 Commits
Author | SHA1 | Date |
---|---|---|
Nemo | 5921640da3 | |
Nemo | 1e67eb8281 | |
Nemo | 48adab4201 | |
Nemo | efba5ac0cf | |
Nemo | 1e20e93cc7 | |
Nemo | a73b58f802 | |
Nemo | dd95ae348a | |
Nemo | a91c242951 |
|
@ -1,3 +1,4 @@
|
|||
vendor/
|
||||
html/
|
||||
*.csv
|
||||
data/
|
|
@ -1,9 +1,9 @@
|
|||
GEM
|
||||
remote: https://rubygems.org/
|
||||
specs:
|
||||
mini_portile2 (2.3.0)
|
||||
nokogiri (1.8.1)
|
||||
mini_portile2 (~> 2.3.0)
|
||||
mini_portile2 (2.4.0)
|
||||
nokogiri (1.10.10)
|
||||
mini_portile2 (~> 2.4.0)
|
||||
|
||||
PLATFORMS
|
||||
ruby
|
||||
|
@ -12,4 +12,4 @@ DEPENDENCIES
|
|||
nokogiri
|
||||
|
||||
BUNDLED WITH
|
||||
1.16.1
|
||||
2.1.4
|
||||
|
|
|
@ -14,11 +14,10 @@ This project uses GNU Parallel, Ruby, Nokogiri, and curl.
|
|||
|
||||
- Keep track of historical data using regularly generated CSV files
|
||||
- Does not use the API (since the rate-limit is too low at 1k/day)
|
||||
+ We need to checkout around 15k restaurant status (closed or not)
|
||||
- We need to checkout around 15k restaurant status (closed or not)
|
||||
- Keep track of whether restaurant is still alive or not
|
||||
- Tweet any restaurant closures (or any new openings)
|
||||
|
||||
|
||||
For now, run the following command to get a diff of new restaurants not in the old listings:
|
||||
|
||||
`q -d , "SELECT * from ./15-01-2018.csv WHERE c1 not in (SELECT c1 from 06-01-2018.csv)"`
|
||||
`q -d , "SELECT * from ./2018-MM-DD.csv WHERE c1 not in (SELECT c1 from 2018-MM-DD.csv)"`
|
||||
|
|
|
@ -9,7 +9,7 @@ mkdir -p html/restaurants
|
|||
|
||||
function dl_z() {
|
||||
echo "[+] $2"
|
||||
curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
|
||||
[ ! -f "html/$2" ] && curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
|
||||
}
|
||||
|
||||
export -f dl_z
|
||||
|
@ -26,9 +26,9 @@ done <<< $(bundle exec ruby parse_dir.rb)
|
|||
while read -r LINK; do
|
||||
FILENAME="$(basename $LINK).html"
|
||||
echo $FILENAME
|
||||
# sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
|
||||
sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
|
||||
done <<< $(bundle exec ruby parse_listing.rb)
|
||||
|
||||
# bundle exec ruby parse_restaurant.rb
|
||||
bundle exec ruby parse_restaurant.rb
|
||||
|
||||
# sem --wait
|
||||
sem --wait
|
|
@ -4,25 +4,9 @@ require 'date'
|
|||
|
||||
restaurants = []
|
||||
|
||||
CSV.open("#{Date.today.to_s}.csv", 'wb') do |csv|
|
||||
csv << ['url', 'title', 'location', 'address', 'cuisine']
|
||||
Dir.glob('html/restaurants-*.html') do |file|
|
||||
Dir.glob('html/restaurants-*.html') do |file|
|
||||
page = Nokogiri::HTML(open(file))
|
||||
|
||||
page.css('.plr10').each do |div|
|
||||
links = div.css('a')
|
||||
spans = div.css('span')
|
||||
|
||||
|
||||
title = links[0].text
|
||||
location = links[1].text
|
||||
address = spans[1].text
|
||||
cuisine = spans[0].text
|
||||
url = links[0]['href']
|
||||
|
||||
csv << [url, title, location, address, cuisine]
|
||||
|
||||
puts url
|
||||
end
|
||||
puts div.css('a')[0]['href']
|
||||
end
|
||||
end
|
|
@ -1,4 +1,77 @@
|
|||
require 'nokogiri' # NOTE(review): not referenced in this script — confirm before removing
require 'date'
require 'cgi'      # NOTE(review): not referenced in this script — confirm before removing
require 'uri'      # NOTE(review): not referenced in this script — confirm before removing
require 'json'
require 'csv'

# Extract one record per restaurant from the downloaded pages in
# html/restaurants/ and write them to data/<YYYY-MM-DD>.csv.
#
# Each page embeds its data as a single line of the form:
#   window.__PRELOADED_STATE__ = JSON.parse("...escaped JSON...");
# We locate that line, unwrap the JS string literal, and parse the JSON.
PARSE_PREFIX = 'JSON.parse('.freeze

# Block form guarantees the CSV handle is flushed and closed even if a page
# fails to parse (the original opened the file and never closed it).
CSV.open("data/#{Date.today}.csv", 'w') do |csv|
  header_written = false

  Dir.glob('html/restaurants/*.html') do |file|
    File.readlines(file).each do |line|
      next unless line =~ /__PRELOADED_STATE__/

      start = line.index(PARSE_PREFIX) + PARSE_PREFIX.length
      # Double parse: the outer call decodes the JS string literal (the line
      # ends with `");`, so we stop at index -3); the inner call decodes the
      # JSON document itself.
      data = JSON.parse(JSON.parse(line[start..-3]))

      # Pages that resolved to a 404 placeholder still carry a state blob.
      next if data['pages']['current']['name'] == '404'

      r = data['pages']['restaurant'][data['pages']['restaurant'].keys.first]
      basic   = r['sections']['SECTION_BASIC_INFO']
      contact = r['sections']['SECTION_RES_CONTACT']
      ratings = r['sections']['SECTION_RATING_HISTOGRAM']

      # Key insertion order defines the CSV column order — do not reorder.
      rest = {}
      rest['rating'] = basic['rating']['aggregate_rating']
      rest['rating_text'] = basic['rating']['rating_text']
      rest['rating_subtitle'] = basic['rating']['rating_subtitle']
      rest['rating_votes'] = basic['rating']['votes']
      rest['is_delivery_only'] = basic['is_delivery_only']
      rest['name'] = data['pages']['current']['pageTitle'].strip
      rest['is_perm_closed'] = basic['is_perm_closed']
      rest['is_temp_closed'] = basic['is_temp_closed']
      rest['is_opening_soon'] = basic['is_opening_soon']
      rest['should_ban_ugc'] = basic['should_ban_ugc']
      rest['is_shelled'] = basic['is_shelled']
      rest['disclaimer_text'] = basic['disclaimer_text'].strip
      rest['cuisines'] = r['sections']['SECTION_RES_HEADER_DETAILS']['CUISINES'].map { |c| c['name'] }.join(', ').strip

      # Histogram buckets arrive ordered 5-star down to 1-star; default to 0
      # when the section or key is missing.
      h = ratings['histogram'] if ratings && ratings.key?('histogram')
      rest['rating_5_count'] = h ? h[0]['value'] : 0
      rest['rating_4_count'] = h ? h[1]['value'] : 0
      rest['rating_3_count'] = h ? h[2]['value'] : 0
      rest['rating_2_count'] = h ? h[3]['value'] : 0
      rest['rating_1_count'] = h ? h[4]['value'] : 0

      rest['zipcode'] = contact['zipcode']
      rest['is_dark_kitchen'] = contact['is_dark_kitchen']
      rest['locality_verbose'] = contact['locality_verbose'].strip
      rest['latitude'] = contact['latitude']
      rest['longitude'] = contact['longitude']
      rest['address'] = contact['address'].strip
      rest['contact'] = contact['is_phone_available'] ? contact['phoneDetails']['phoneStr'] : ""

      # res_chain_text is e.g. "N outlets"; empty means a standalone venue.
      chain_text = contact['res_chain_text']
      rest['chain_count'] = chain_text.empty? ? 0 : chain_text.match(/\d+/)[0]

      rest['status'] = basic['res_status_text']

      if ratings && ratings.key?('rating_streak')
        last_rated = Time.at(ratings['rating_streak'][-1]['timestamp'])
        rest['last_rating_timestamp'] = last_rated.strftime('%Y-%m-%d')
        rest['last_rating_month'] = last_rated.strftime('%Y-%m')
      else
        # Epoch sentinel for restaurants with no rating history.
        rest['last_rating_timestamp'] = '1970-01-01'
        rest['last_rating_month'] = '1970-01'
      end

      rest['id'] = data['pages']['current']['resId']

      # Header row comes from the first record's keys, written exactly once.
      csv << rest.keys unless header_written
      header_written = true
      csv << rest.values
    end
  end
end
|
|
@ -0,0 +1,37 @@
|
|||
require 'csv'
require 'set'

# Aggregate the daily restaurant snapshots in data/*.csv into stats.csv:
# one row per snapshot date, one column per locality, each cell holding the
# number of restaurants seen in that locality on that date.

data = {}
locales = Set.new

Dir.glob("data/*.csv") do |file|
  # Snapshot files are named <date>.csv; the basename is the row key.
  date_key = File.basename file, '.csv'
  data[date_key] = {}
  puts file
  begin
    CSV.foreach(file, headers: true) do |row|
      locale = row[2]
      # Malformed rows may lack the locality column; skip them rather than
      # aborting the whole file (the original raised NoMethodError here and
      # the blanket rescue silently dropped the rest of the file).
      next if locale.nil?

      # Normalize "<area>, Bangalore" down to just the area name.
      locale = locale[0..-12] if locale.end_with?(", Bangalore")
      locales << locale
      data[date_key][locale] ||= 0
      data[date_key][locale] += 1
    end
  rescue StandardError => e
    # Never rescue Exception (it swallows SignalException/SystemExit), and
    # never swallow silently — report the bad snapshot and carry on.
    warn "skipping #{file}: #{e.class}: #{e.message}"
  end
end

# Emit columns in a stable, sorted order so diffs between runs are meaningful.
locales = locales.to_a.sort
CSV.open("stats.csv", "wb") do |csv|
  csv << ["date"].concat(locales)
  data.each do |date, census|
    row = [date]
    locales.each do |l|
      # Localities absent on a given date count as zero.
      row << (census[l] || 0)
    end
    csv << row
  end
end
|
Loading…
Reference in New Issue