Implements parallel streaming downloader
- As the parser returns more restaurant names, it keeps downloading
This commit is contained in:
parent
39f4adfc55
commit
facc07edfa
13
README.md
13
README.md
|
@@ -2,13 +2,18 @@
|
||||||
|
|
||||||
Keep track of restaurant openings and closures in the city.
|
Keep track of restaurant openings and closures in the city.
|
||||||
|
|
||||||
# Quirk
|
# Quirks
|
||||||
|
|
||||||
Zomato does not support HTTP/1.1, so wget can't be used.
|
- Zomato does not support HTTP/1.1, so wget can't be used.
|
||||||
|
|
||||||
|
# Tech
|
||||||
|
|
||||||
|
This project uses GNU Parallel, Ruby, Nokogiri, and curl.
|
||||||
|
|
||||||
# Features
|
# Features
|
||||||
|
|
||||||
- Keep track of historical data
|
- Keep track of historical data using regularly generated CSV files
|
||||||
- Does not use the API (since the rate-limit is too low at 1k/day)
|
- Does not use the API (since the rate-limit is too low at 1k/day)
|
||||||
+ We need to check the status of around 8k restaurants
|
+ We need to check the status of around 8k restaurants
|
||||||
|
- Keep track of whether a restaurant is still alive or not
|
||||||
|
- Tweet any restaurant closures (or any new openings)
|
||||||
|
|
13
bootstrap.sh
13
bootstrap.sh
|
@@ -5,13 +5,15 @@ ZOMATO_CITY=bangalore
|
||||||
DIRECTORY_URL="$ZOMATO_ROOT_URL/$ZOMATO_CITY/directory"
|
DIRECTORY_URL="$ZOMATO_ROOT_URL/$ZOMATO_CITY/directory"
|
||||||
USER_AGENT="Mozilla/Gecko/Firefox/58.0"
|
USER_AGENT="Mozilla/Gecko/Firefox/58.0"
|
||||||
|
|
||||||
mkdir -p html
|
mkdir -p html/restaurants
|
||||||
|
|
||||||
function dl() {
|
function dl_z() {
|
||||||
echo "[+] $2"
|
echo "[+] $2"
|
||||||
curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
|
curl -sS --http2-prior-knowledge --compressed -H "User-Agent: $USER_AGENT" $1 > "html/$2"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export -f dl_z
|
||||||
|
|
||||||
dl "$DIRECTORY_URL" "directory.html"
|
dl "$DIRECTORY_URL" "directory.html"
|
||||||
|
|
||||||
while read -r LINK; do
|
while read -r LINK; do
|
||||||
|
@@ -19,4 +21,9 @@ while read -r LINK; do
|
||||||
dl "$LINK" "$FILENAME"
|
dl "$LINK" "$FILENAME"
|
||||||
done <<< $(bundle exec ruby parse_dir.rb)
|
done <<< $(bundle exec ruby parse_dir.rb)
|
||||||
|
|
||||||
bundle exec ruby parse_restaurant.rb
|
while read -r LINK; do
|
||||||
|
FILENAME="$(basename $LINK).html"
|
||||||
|
sem -j 30 --timeout 300% dl_z "$LINK" "restaurants/$FILENAME"
|
||||||
|
done <<< $(bundle exec ruby parse_restaurant.rb)
|
||||||
|
|
||||||
|
sem --wait
|
|
@@ -21,6 +21,7 @@ CSV.open("database.csv", "wb") do |csv|
|
||||||
|
|
||||||
csv << [url, title, location, address, cuisine]
|
csv << [url, title, location, address, cuisine]
|
||||||
|
|
||||||
|
puts url
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
Loading…
Reference in New Issue