From 99924285c510ac8f4ba8042dd3fa701e7017f56c Mon Sep 17 00:00:00 2001 From: Nemo Date: Tue, 2 Jan 2018 00:19:07 +0530 Subject: [PATCH] Initial commit --- .gitignore | 2 ++ Gemfile | 4 ++++ Gemfile.lock | 19 +++++++++++++++++++ README.md | 14 ++++++++++++++ bootstrap.sh | 20 ++++++++++++++++++++ parse_dir.rb | 11 +++++++++++ 6 files changed, 70 insertions(+) create mode 100644 .gitignore create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 README.md create mode 100755 bootstrap.sh create mode 100644 parse_dir.rb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..681c030 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +vendor/ +html/ \ No newline at end of file diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..0e95724 --- /dev/null +++ b/Gemfile @@ -0,0 +1,4 @@ +source 'https://rubygems.org' + +gem 'net-http2' +gem 'nokogiri' \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..5f3225d --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,19 @@ +GEM + remote: https://rubygems.org/ + specs: + http-2 (0.8.4) + mini_portile2 (2.3.0) + net-http2 (0.16.0) + http-2 (= 0.8.4) + nokogiri (1.8.1) + mini_portile2 (~> 2.3.0) + +PLATFORMS + ruby + +DEPENDENCIES + net-http2 + nokogiri + +BUNDLED WITH + 1.16.1 diff --git a/README.md b/README.md new file mode 100644 index 0000000..e9b243e --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# bengaluru-food-census + +Keep track of restaurant openings and closures in the city. + +# Quirk + +Zomato does not support HTTP/1.1, so wget can't be used. 
+
+
+# Features
+
+- Keep track of historical data
+- Does not use the API (since the rate-limit is too low at 1k/day)
+  + We need to check the status of around 8k restaurants
\ No newline at end of file
diff --git a/bootstrap.sh b/bootstrap.sh
new file mode 100755
index 0000000..af1da0e
--- /dev/null
+++ b/bootstrap.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Download the Zomato restaurant directory for one city into ./html:
+# first the top-level directory page, then every sub-page that
+# parse_dir.rb finds linked from it.
+
+ZOMATO_ROOT_URL=https://www.zomato.com
+ZOMATO_CITY=bangalore
+DIRECTORY_URL="$ZOMATO_ROOT_URL/$ZOMATO_CITY/directory"
+USER_AGENT="Mozilla/Gecko/Firefox/58.0"
+
+mkdir -p html
+
+# dl URL FILENAME: fetch URL over HTTP/2 and save the body as html/FILENAME.
+function dl() {
+  echo "[+] $2"
+  curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" "$1" > "html/$2"
+}
+
+dl "$DIRECTORY_URL" "directory.html"
+
+# Read the links through process substitution instead of an unquoted
+# here-string: the here-string form can be word-split onto a single line by
+# older bash releases, and it runs one bogus iteration when there are no links.
+while read -r LINK; do
+  FILENAME="$(basename "$LINK").html"
+  dl "$LINK" "$FILENAME"
+done < <(bundle exec ruby parse_dir.rb)
diff --git a/parse_dir.rb b/parse_dir.rb
new file mode 100644
index 0000000..38838d2
--- /dev/null
+++ b/parse_dir.rb
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+# Prints, one per line, every link in the saved directory page whose URL
+# starts with LINK_PREFIX.
+
+require 'nokogiri'
+
+DIRECTORY_PAGE = 'html/directory.html'
+LINK_PREFIX = 'https://www.zomato.com/bangalore/directory/restaurants-'
+
+# The block form closes the file handle as soon as the page is parsed.
+page = File.open(DIRECTORY_PAGE) { |file| Nokogiri::HTML(file) }
+
+page.css('a').each do |link|
+  href = link['href']
+  puts href if href && href.start_with?(LINK_PREFIX)
+end
\ No newline at end of file