From 39f4adfc55aa4a34ab4bb047e1411c7816f1bf2a Mon Sep 17 00:00:00 2001
From: Nemo
Date: Tue, 2 Jan 2018 00:45:51 +0530
Subject: [PATCH] Adds parser and CSV generator

---
Review notes (ignored by git am, not part of the commit message):
parse_restaurant.rb was revised to close each HTML file after parsing, to
skip listing elements that lack the expected links/spans (with a warning)
instead of aborting, to process input files in sorted order, and to drop the
unused `restaurants` array. Blob ids on the `index` lines were not
regenerated, and leading whitespace on context lines was reconstructed and
should be checked against the tree (or applied with --ignore-whitespace).

 .gitignore          |  3 ++-
 README.md           |  2 +-
 bootstrap.sh        |  2 ++
 parse_restaurant.rb | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 parse_restaurant.rb

diff --git a/.gitignore b/.gitignore
index 681c030..36943b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 vendor/
-html/
\ No newline at end of file
+html/
+*.csv
\ No newline at end of file
diff --git a/README.md b/README.md
index e9b243e..8ce9843 100644
--- a/README.md
+++ b/README.md
@@ -11,4 +11,4 @@ Zomato does not support HTTP/1.1, so wget can't be used.
 
 - Keep track of historical data
 - Does not use the API (since the rate-limit is too low at 1k/day)
-    + We need to checkou around 8k restaurant status
\ No newline at end of file
+    + We need to checkout around 8k restaurant status
\ No newline at end of file
diff --git a/bootstrap.sh b/bootstrap.sh
index af1da0e..f1e1b1c 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -18,3 +18,5 @@ while read -r LINK; do
     FILENAME="$(basename $LINK).html"
     dl "$LINK" "$FILENAME"
 done <<< $(bundle exec ruby parse_dir.rb)
+
+bundle exec ruby parse_restaurant.rb
\ No newline at end of file
diff --git a/parse_restaurant.rb b/parse_restaurant.rb
new file mode 100644
index 0000000..1c2e0b0
--- /dev/null
+++ b/parse_restaurant.rb
@@ -0,0 +1,34 @@
+require 'nokogiri'
+require "csv"
+
+# Writes one CSV row per `.plr10` element found in the saved
+# html/restaurants-*.html pages to database.csv.
+CSV.open("database.csv", "wb") do |csv|
+  csv << ["url", "title", "location", "address", "cuisine"]
+
+  # Sorted so that row order does not depend on directory listing order.
+  Dir.glob("html/restaurants-*.html").sort.each do |file|
+    # Block form closes the file handle once the page has been parsed.
+    page = File.open(file) { |html| Nokogiri::HTML(html) }
+
+    page.css('.plr10').each do |div|
+      links = div.css('a')
+      spans = div.css('span')
+
+      # Fields are picked by position: skip an element that lacks the two
+      # links and two spans read below instead of raising NoMethodError.
+      if links.length < 2 || spans.length < 2
+        warn "#{file}: skipping .plr10 element without two links and two spans"
+        next
+      end
+
+      title = links[0].text
+      location = links[1].text
+      address = spans[1].text
+      cuisine = spans[0].text
+      url = links[0]['href']
+
+      csv << [url, title, location, address, cuisine]
+    end
+  end
+end