# Patch summary: README refresh, parallel restaurant-page downloads in
# bootstrap.sh (GNU Parallel `sem`), and parse_restaurant.rb printing each URL.
#
# Review fixes folded into the bootstrap.sh hunks:
#   - both call sites of the renamed helper now call dl_z instead of dl
#     (the old name no longer exists once the function is renamed);
#   - USER_AGENT is exported so the dl_z jobs launched through sem can read it.
#
# NOTE(review): this patch was rebuilt from a copy whose newlines and
# indentation had been flattened. Blank context lines and leading whitespace
# are assumptions chosen to satisfy each hunk's @@ line counts -- verify them
# against the real files (or apply with --ignore-whitespace) before use.
# NOTE(review): the bootstrap.sh index line still names the original
# post-image blob; regenerate the patch if that hash matters.
diff --git a/README.md b/README.md
index 8ce9843..c1272c3 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,18 @@
 
 Keep track of restaurant openings and closures in the city.
 
-# Quirk
+# Quirks
 
-Zomato does not support HTTP/1.1, so wget can't be used.
+- Zomato does not support HTTP/1.1, so wget can't be used.
 
+# Tech
+
+This project uses GNU Parallel, Ruby, Nokogiri, and curl.
 
 # Features
 
-- Keep track of historical data
+- Keep track of historical data using regularly generated CSV files
 - Does not use the API (since the rate-limit is too low at 1k/day)
-    + We need to checkout around 8k restaurant status
\ No newline at end of file
+    + We need to checkout around 8k restaurant status
+- Keep track of whether restaurant is still alive or not
+- Tweet any restaurant closures (or any new openings)
diff --git a/bootstrap.sh b/bootstrap.sh
index f1e1b1c..ead4696 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -5,13 +5,15 @@ ZOMATO_CITY=bangalore
 DIRECTORY_URL="$ZOMATO_ROOT_URL/$ZOMATO_CITY/directory"
-USER_AGENT="Mozilla/Gecko/Firefox/58.0"
+export USER_AGENT="Mozilla/Gecko/Firefox/58.0"
 
-mkdir -p html
+mkdir -p html/restaurants
 
-function dl() {
+function dl_z() {
   echo "[+] $2"
   curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
 }
 
+export -f dl_z
+
-dl "$DIRECTORY_URL" "directory.html"
+dl_z "$DIRECTORY_URL" "directory.html"
 
 while read -r LINK; do
@@ -19,4 +21,9 @@ while read -r LINK; do
-  dl "$LINK" "$FILENAME"
+  dl_z "$LINK" "$FILENAME"
 done <<< $(bundle exec ruby parse_dir.rb)
 
-bundle exec ruby parse_restaurant.rb
\ No newline at end of file
+while read -r LINK; do
+  FILENAME="$(basename $LINK).html"
+  sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
+done <<< $(bundle exec ruby parse_restaurant.rb)
+
+sem --wait
\ No newline at end of file
diff --git a/parse_restaurant.rb b/parse_restaurant.rb
index 1c2e0b0..8382f64 100644
--- a/parse_restaurant.rb
+++ b/parse_restaurant.rb
@@ -21,6 +21,7 @@ CSV.open("database.csv", "wb") do |csv|
 
 
       csv << [url, title, location, address, cuisine]
+      puts url
     end
   end
 end
\ No newline at end of file