#!/bin/bash
#
# Settings and output-directory setup for the Zomato page downloader.

# Base URL of the site that pages are fetched from.
ZOMATO_ROOT_URL="https://www.zomato.com"

# City slug; it becomes a path component of DIRECTORY_URL.
ZOMATO_CITY=bangalore

# Directory page for the configured city.
DIRECTORY_URL="$ZOMATO_ROOT_URL/$ZOMATO_CITY/directory"

# Value sent in the User-Agent request header of every download.
USER_AGENT="Mozilla/Gecko/Firefox/58.0"

# All downloads are written below html/, so stop right away if the
# directory tree cannot be created instead of failing on every page.
mkdir -p html/restaurants || exit 1
#######################################
# Download one URL into the html/ directory.
# Globals:   USER_AGENT (read) - sent as the User-Agent request header
# Arguments: $1 - URL to fetch
#            $2 - destination path, relative to html/
# Outputs:   prints "[+] <destination>" to stdout and writes the response
#            body to html/<destination>
# Returns:   status of curl, or of the redirection if the file cannot be
#            opened for writing
#######################################
dl_z() {
  local url=$1
  local dest=$2

  printf '%s\n' "[+] $dest"

  # The URL is quoted so that it always reaches curl as a single argument,
  # without word splitting or glob expansion.
  curl -sS --http2-prior-knowledge --compressed \
    -H "User-Agent: $USER_AGENT" "$url" > "html/$dest"
}

# Make the function visible to child bash processes.
export -f dl_z
# Fetch the city directory page before parse_dir.rb is run.
# NOTE(review): presumably parse_dir.rb reads html/directory.html - confirm.
dl_z "$DIRECTORY_URL" "directory.html"

# Download all the listing pages
# The parser output is read through process substitution: an unquoted
# here-string is collapsed onto one line by bash older than 4.4, and always
# supplies one (possibly empty) line even when the parser prints nothing.
# The "|| [[ -n ... ]]" part keeps a final line that lacks a newline.
while read -r LINK || [[ -n "$LINK" ]]; do
  # Ignore blank lines so no download is attempted for an empty URL.
  [[ -n "$LINK" ]] || continue

  FILENAME="$(basename -- "$LINK").html"
  dl_z "$LINK" "$FILENAME"
done < <(bundle exec ruby parse_dir.rb)
# Download all the restaurant pages (~15k)
# The actual download is commented out below, so for now this loop only
# prints the file name derived from each link printed by parse_listing.rb.
# Process substitution is used instead of an unquoted here-string, which is
# collapsed onto one line by bash older than 4.4 and yields one empty line
# when the parser prints nothing.
while read -r LINK || [[ -n "$LINK" ]]; do
  # Ignore blank lines so no bogus ".html" name is produced.
  [[ -n "$LINK" ]] || continue

  FILENAME="$(basename -- "$LINK").html"
  printf '%s\n' "$FILENAME"
  # sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
done < <(bundle exec ruby parse_listing.rb)

# bundle exec ruby parse_restaurant.rb

# sem --wait