Update scrape-internet-radio-manual.sh

This commit is contained in:
hossein s. borhani 2022-11-28 15:08:36 +03:30 committed by GitHub
parent c642f2f31d
commit 901f6d1890
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -4,10 +4,10 @@
lynx --dump --listonly --nonumbers https://www.internet-radio.com/stations/ | grep 'https://www.internet-radio.com/stations/' > links.txt lynx --dump --listonly --nonumbers https://www.internet-radio.com/stations/ | grep 'https://www.internet-radio.com/stations/' > links.txt
# strip unnecessary part of links (we'll add them later in the for loop) # strip unnecessary part of links (we'll add them later in the for loop)
cat links.txt | sed 's!https://www.internet-radio.com/stations/!!' | sed 's/\///g' | sed '/^$/d' | sort | uniq > links2.txt cat links.txt | sed 's!https://www.internet-radio.com/stations/!!' | sed 's/\///g' | sed '/^$/d' | sed -e 's/ /%20/g' | sort | uniq > links2.txt
# scrape links of the streams # scrape links of the streams
for i in "" page{2..10} ; do for j in $(cat links2.txt) ; do curl https://www.internet-radio.com/stations/$j/$i.html | htmlq --attribute href a | grep '.m3u' | cut -b 37- | awk -F '\\listen' '{print $1""}' | awk -F '\\.m3u' '{print $1""}' | awk -F '\\&t=' '{print $1""}' | awk '!seen[$0]++' | sed '/^$/d' >> $j.txt ; sleep 1 ; done ; done for i in "" page{2..10} ; do for j in $(cat links2.txt) ; do curl -s https://www.internet-radio.com/stations/$j/$i.html | htmlq --attribute href a | grep '.m3u' | cut -b 37- | awk -F '\\listen' '{print $1""}' | awk -F '\\.m3u' '{print $1""}' | awk -F '\\&t=' '{print $1""}' | awk '!seen[$0]++' | sed '/^$/d' | awk 'length>10' >> $j.txt ; echo "$j - $i scraped" ; done ; done
# a few links have more than 10 pages, the longest page is pop with 50 pages, so if you absolutely need all of them you can do those with a longer loop # a few links have more than 10 pages, the longest page is pop with 50 pages, so if you absolutely need all of them you can do those with a longer loop
# here is the list of the bigger links = Country Talk 80s Oldies Dance Gospel Christian Rock Pop # here is the list of the bigger links = Country Talk 80s Oldies Dance Gospel Christian Rock Pop
@ -15,6 +15,9 @@ for i in "" page{2..10} ; do for j in $(cat links2.txt) ; do curl https://www.in
# convert links to m3u stream files # convert links to m3u stream files
for i in $(cat links2.txt) ; do sed "s/^/#EXTINF:-1\n/" $i.txt | sed '1s/^/#EXTM3U\n/' > $i.m3u ; done for i in $(cat links2.txt) ; do sed "s/^/#EXTINF:-1\n/" $i.txt | sed '1s/^/#EXTM3U\n/' > $i.m3u ; done
# replace %20 in filenames with space
# Renames every *.m3u file in the current directory, turning each "%20" in
# its name back into a literal space (the scrape step encodes spaces as %20).
unescape_m3u_names() {
  local f target
  for f in *.m3u ; do
    # an unmatched glob stays as the literal "*.m3u"; nothing to rename then
    [ -e "$f" ] || continue
    # the g flag is required: multi-word names contain more than one %20
    target=$(printf '%s\n' "$f" | sed 's/%20/ /g')
    # skip names without %20, otherwise mv complains source and target are the same file
    [ "$f" = "$target" ] || mv -- "$f" "$target"
  done
}
unescape_m3u_names
# move stream to git folder # move stream to git folder
mv *.m3u c:/git/m3u-radio-music-playlists/internet-radio/ mv *.m3u c:/git/m3u-radio-music-playlists/internet-radio/