下载搜狗的所有细胞词库

下载搜狗的所有细胞词库
import sys
import requests
from bs4 import BeautifulSoup as BS


def get_links(url, timeout=10):
    """Fetch *url* and return every anchor on the page as (text, href) pairs.

    Args:
        url: Address of the HTML page to scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of ``(text, href)`` tuples, one per ``<a>`` tag. ``href`` is
        ``''`` when the tag has no such attribute. The list is empty when
        the request fails or the page contains no anchors.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
    except requests.RequestException as exc:
        # Still best effort (the caller just sees no links), but the failure
        # is reported on stderr instead of being silently discarded.
        print('get_links: %s: %s' % (url, exc), file=sys.stderr)
        return []
    # find_all() returns an empty list when nothing matches, whereas find()
    # would return None.
    anchors = BS(response.text, 'html.parser').find_all('a')
    # A Tag exposes .text directly, but HTML attributes such as href must be
    # read with .get(); the '' default avoids None for anchors without href.
    return [(a.text, a.get('href', '')) for a in anchors]


# Site root; the script appends the remaining URL pieces to it.
B = 'https://pinyin.sogou.com/'
# Path fragment that identifies a detail-page link on the index page.
S = '/dict/detail/index/'
# Query-string suffix carried by those detail-page links.
T = '?rf=dictindex'

if len(sys.argv) == 1:
    # No argument: list every detail link found on the index page as
    # "<link text> <id>". The id is what remains of the href once S and T
    # are stripped, and is what the other mode expects as its argument.
    for text, href in get_links(B + 'dict/'):
        if S in href:
            print(text, href.replace(S, '').replace(T, ''))
else:
    # One id given: open that detail page and print its download links.
    detail_url = B + S + sys.argv[1] + T
    for _text, href in get_links(detail_url):
        if 'download_cell' in href:
            # NOTE(review): the hrefs are presumably protocol-relative
            # ("//host/..."), hence the scheme prefix - confirm.
            print('https:' + href)

先试了一通 wget -r -A '*.scel',没成功,于是请AI写了个 link_extractor.py,自己又改了改(即下面运行的 le.py)。不带参数运行时列出词库页上的条目名称和编号;带上编号运行时列出该条目下各细胞词库的下载链接。

$ py le.py
...
汽车词汇大全 15153
歌手人名大全 20658
热门电影大全 20652
$ py le.py 15153
...
https://pinyin.sogou.com/d/dict/download_cell.php?id=82331&name=斯柯达
https://pinyin.sogou.com/d/dict/download_cell.php?id=93870&name=考啦维护
https://pinyin.sogou.com/d/dict/download_cell.php?id=93866&name=考啦学车
https://pinyin.sogou.com/d/dict/download_cell.php?id=93868&name=考啦陪驾