采集 SoundCloud 专辑

1、需要设置代理

2、滚屏加载更多歌曲

3、解析歌曲完整下载路径

import os
import time
import json
import requests
from selenium import webdriver
from scrapy.selector import Selector
from selenium.webdriver.support.ui import WebDriverWait

# Proxy mapping passed to `requests` calls in this script via `proxies=`.
# Both schemes go through the same HTTP proxy endpoint.
proxies = {
    "http": "http://192.168.1.88:1088",
    "https": "http://192.168.1.88:1088",
}

def music_download(url):
    """Download one audio file into the local ``music/`` directory.

    The file name is the last path segment of ``url`` with any query
    string removed. After the transfer, the saved file is considered a
    success only if it is larger than 1 MB (1*1000*1000 bytes); the
    outcome is printed either way.

    Args:
        url: Direct download URL of the audio stream.

    Returns:
        None. Results are reported on stdout.
    """
    # Last path segment, query string stripped.
    file_name = url.split('?')[0].split('/')[-1]
    if not file_name:
        # URL ends in "/" (or is empty): there is nothing to name the file.
        print (f"{file_name} download fail.")
        return

    # Check the same path that is written; the original checked the bare
    # file name in the working directory and therefore always reported fail.
    file_path = os.path.join("music", file_name)
    os.makedirs("music", exist_ok=True)

    try:
        # Go through the same proxy as the API calls in this script, and
        # stream to disk in chunks instead of buffering the whole body.
        with requests.get(url, stream=True, proxies=proxies) as r:
            # Do not save an HTTP error page as if it were audio.
            r.raise_for_status()
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
    except requests.RequestException:
        print (f"{file_name} download fail.")
        return

    size = os.path.getsize(file_path) if os.path.exists(file_path) else 0
    if size > 1*1000*1000:
        print(f"{file_name} download success, file size: {size/1000/1000}M")
    else:
        print (f"{file_name} download fail.")

def music_index(url):
    """Resolve a track page URL to its stream URL and download it.

    Two API calls are made through the module-level ``proxies``:
    ``/resolve`` to obtain the track ``id``, then
    ``/i1/tracks/<id>/streams`` to obtain ``http_mp3_128_url``, which is
    handed to ``music_download``. Any step that fails is reported on
    stdout and the track is skipped, so one bad track does not abort the
    caller's loop.

    Args:
        url: Track page URL (e.g. the ``href`` of a cover-art link).

    Returns:
        None.
    """
    if not url:
        # e.g. an element without an href attribute
        return

    url_desc_api = f"https://api.soundcloud.com/resolve?url={url}&client_id=LvWovRaJZlWCHql0bISuum8Bd2KX79mb"
    try:
        r = requests.get(url_desc_api, proxies=proxies)
        json_r = json.loads(r.text)
    except (requests.RequestException, ValueError) as exc:
        print (f"resolve failed for {url}: {exc}")
        return

    # An error payload may lack "id"; treat that as "nothing to download"
    # rather than raising KeyError.
    sound_id = json_r.get('id') if isinstance(json_r, dict) else None
    if sound_id is None:
        return

    url_download_api = f"https://api.soundcloud.com/i1/tracks/{sound_id}/streams?client_id=LvWovRaJZlWCHql0bISuum8Bd2KX79mb"
    try:
        sound_r = requests.get(url_download_api, proxies=proxies)
        json_sound_r = json.loads(sound_r.text)
    except (requests.RequestException, ValueError) as exc:
        print (f"stream lookup failed for {sound_id}: {exc}")
        return

    print (f"当前任务ID: {sound_id}")

    music_download_url = None
    if isinstance(json_sound_r, dict):
        music_download_url = json_sound_r.get('http_mp3_128_url')
    if not music_download_url:
        # No 128 kbps MP3 stream offered for this track.
        return

    try:
        music_download(music_download_url)
    except Exception as exc:
        # Stay best-effort like the original, but surface the error
        # instead of silently discarding it.
        print (f"download error for {sound_id}: {exc}")

def soundcloud_index():
    """Open the artist page in Chrome, load all tracks, and download each.

    Chrome is started behind the HTTP proxy. The page is scrolled to the
    bottom repeatedly until its height stops growing, then every element
    with class ``sound__coverArt`` is read for its ``href`` and passed to
    ``music_index``. The browser is closed when the work finishes or fails.

    Returns:
        None.
    """
    url = "https://soundcloud.com/beyond-synth"
    chromeOptions = webdriver.ChromeOptions()
    # Route the browser through the proxy.
    chromeOptions.add_argument("--proxy-server=http://192.168.1.88:1088")
    browser = webdriver.Chrome(chrome_options = chromeOptions)
    try:
        browser.get(url)

        # Scroll until the page stops growing.
        js1 = 'return document.body.scrollHeight'
        js2 = 'window.scrollTo(0, document.body.scrollHeight)'
        old_scroll_height = 0
        while True:
            scroll_height = browser.execute_script(js1)
            # Strictly greater: with ">=" an unchanged height kept the
            # loop running forever once the last page had loaded.
            if scroll_height <= old_scroll_height:
                break
            old_scroll_height = scroll_height
            browser.execute_script(js2)
            # Give lazily loaded tracks time to be appended.
            time.sleep(1)

        # Process the fully loaded page.
        content = browser.find_elements_by_class_name('sound__coverArt')
        for count, c in enumerate(content, start=1):
            single_url = c.get_attribute('href')
            music_index(single_url)
            time.sleep(1)
            print (f"当前第 {count} 条")
    finally:
        # Always release the browser process, even after an error.
        browser.quit()
    
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    soundcloud_index()

发表评论

电子邮件地址不会被公开。 必填项已用*标注