bengaluru-food-census/bootstrap.sh

34 lines
861 B
Bash
Raw Permalink Normal View History

2018-01-01 18:49:07 +00:00
#!/bin/bash
2018-01-02 01:34:08 +00:00
# Scrape configuration: site root, city slug, and the city's directory URL.
ZOMATO_ROOT_URL="https://www.zomato.com"
ZOMATO_CITY=bangalore
DIRECTORY_URL="$ZOMATO_ROOT_URL/$ZOMATO_CITY/directory"
# Exported because dl_z is also run through sem, which starts child shells
# that inherit only exported variables; left unexported, the User-Agent
# header value would expand to an empty string in those jobs.
export USER_AGENT="Mozilla/Gecko/Firefox/58.0"
# Downloads are written under html/; restaurant pages go in html/restaurants/.
mkdir -p html/restaurants
2018-01-01 18:49:07 +00:00
#######################################
# Download a URL into html/<path>, skipping files that already exist.
# Globals:   USER_AGENT (read) - value sent in the User-Agent header
# Arguments: $1 - URL to fetch
#            $2 - destination path relative to html/
# Outputs:   prints "[+] <path>" to stdout
# Returns:   0 if the file was already present or was downloaded,
#            otherwise curl's exit status
#######################################
dl_z() {
  local url=$1
  local dest="html/$2"
  local partial="${dest}.part"
  local rc

  echo "[+] $2"

  # Already fetched on a previous run: report success rather than the
  # non-zero status of a failed "file is missing" test.
  if [ -f "$dest" ]; then
    return 0
  fi

  # Write to a temporary name and rename only on success, so a failed or
  # interrupted transfer never leaves a truncated file at the final path
  # (which later runs would treat as already downloaded).
  if curl -sS --http2-prior-knowledge --compressed \
    -H "User-Agent: $USER_AGENT" "$url" > "$partial"; then
    mv -- "$partial" "$dest"
  else
    rc=$?
    rm -f -- "$partial"
    return "$rc"
  fi
}
# Export the function so the child shells started by sem can call it.
export -f dl_z

# Fetch the city directory page first.
dl_z "$DIRECTORY_URL" "directory.html"

# Download all the listing pages
# parse_dir.rb's output is consumed as one link per line. It is read through
# process substitution because an unquoted $(...) here-string is word-split
# by bash releases before 4.4, which collapses every link onto a single line.
while read -r LINK; do
  # Skip blank lines so empty parser output does not trigger a bogus download.
  [[ -n "$LINK" ]] || continue
  FILENAME="$(basename -- "$LINK").html"
  dl_z "$LINK" "$FILENAME"
done < <(bundle exec ruby parse_dir.rb)
2018-01-01 19:15:51 +00:00
2018-01-06 14:07:40 +00:00
# Download all the restaurant pages (~15k)
# parse_listing.rb's output is consumed as one link per line. It is read
# through process substitution because an unquoted $(...) here-string is
# word-split by bash releases before 4.4, collapsing all links onto one line.
while read -r LINK; do
  # Skip blank lines so empty parser output does not queue a bogus download.
  [[ -n "$LINK" ]] || continue
  FILENAME="$(basename -- "$LINK").html"
  echo "$FILENAME"
  # Queue the download in the background, at most 30 jobs at a time.
  sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
done < <(bundle exec ruby parse_listing.rb)

# Block until every queued download has finished before parsing; otherwise
# the parser starts while jobs are still running.
# NOTE(review): assumes parse_restaurant.rb reads the files under
# html/restaurants - confirm against that script.
sem --wait
bundle exec ruby parse_restaurant.rb