Skip to content

Commit

Permalink
update api
Browse files Browse the repository at this point in the history
  • Loading branch information
rachpt committed Apr 6, 2020
1 parent 44986b2 commit 1de9e6d
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 30 deletions.
51 changes: 28 additions & 23 deletions lanzou/api/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,33 +430,33 @@ def _captcha_recognize(self, file_token):
def get_file_info_by_url(self, share_url, pwd='') -> FileDetail:
"""获取文件各种信息(包括下载直链)"""
if not is_file_url(share_url): # 非文件链接返回错误
return FileDetail(LanZouCloud.URL_INVALID)
return FileDetail(LanZouCloud.URL_INVALID, pwd=pwd, url=share_url)

first_page = self._get(share_url) # 文件分享页面(第一页)
if not first_page:
return FileDetail(LanZouCloud.NETWORK_ERROR)
return FileDetail(LanZouCloud.NETWORK_ERROR, pwd=pwd, url=share_url)

first_page = remove_notes(first_page.text) # 去除网页里的注释
if '文件取消' in first_page:
return FileDetail(LanZouCloud.FILE_CANCELLED)
return FileDetail(LanZouCloud.FILE_CANCELLED, pwd=pwd, url=share_url)

# 这里获取下载直链 304 重定向前的链接
if '输入密码' in first_page: # 文件设置了提取码时
if len(pwd) == 0:
return FileDetail(LanZouCloud.LACK_PASSWORD) # 没给提取码直接退出
return FileDetail(LanZouCloud.LACK_PASSWORD, pwd=pwd, url=share_url) # 没给提取码直接退出
# data : 'action=downprocess&sign=AGZRbwEwU2IEDQU6BDRUaFc8DzxfMlRjCjTPlVkWzFSYFY7ATpWYw_c_c&p='+pwd,
sign = re.search(r"sign=(\w+?)&", first_page)
sign = sign.group(1) if sign else ""
post_data = {'action': 'downprocess', 'sign': sign, 'p': pwd}
link_info = self._post(self._host_url + '/ajaxm.php', post_data) # 保存了重定向前的链接信息和文件名
second_page = self._get(share_url) # 再次请求文件分享页面,可以看见文件名,时间,大小等信息(第二页)
if not link_info or not second_page.text:
return FileDetail(LanZouCloud.NETWORK_ERROR)
return FileDetail(LanZouCloud.NETWORK_ERROR, pwd=pwd, url=share_url)
link_info = link_info.json()
second_page = remove_notes(second_page.text)
# 提取文件信息
f_name = link_info['inf']
f_size = re.search(r'大小:(.+?)</div>', second_page)
f_size = re.search(r'大小.+?(\d[\d.]+\s?[BKM]?)<', second_page)
f_size = f_size.group(1) if f_size else ''
f_time = re.search(r'class="n_file_infos">(.+?)</span>', second_page)
f_time = f_time.group(1) if f_time else ''
Expand All @@ -465,22 +465,24 @@ def get_file_info_by_url(self, share_url, pwd='') -> FileDetail:
else: # 文件没有设置提取码时,文件信息都暴露在分享页面上
para = re.search(r'<iframe.*?src="(.+?)"', first_page).group(1) # 提取下载页面 URL 的参数
# 文件名位置变化很多
f_name = re.search(r'<div class="filethetext".+?>([^<>]+?)</div>', first_page) or \
f_name = re.search(r"<title>(.+?) - 蓝奏云</title>", first_page) or \
re.search(r'<div class="filethetext".+?>([^<>]+?)</div>', first_page) or \
re.search(r'<div style="font-size.+?>([^<>].+?)</div>', first_page) or \
re.search(r"var filename = '(.+?)';", first_page) or \
re.search(r'id="filenajax">(.+?)</div>', first_page) # VIP 分享页面
re.search(r'id="filenajax">(.+?)</div>', first_page) or \
re.search(r'<div class="b"><span>([^<>]+?)</span></div>', first_page)
f_name = f_name.group(1) if f_name else "未匹配到文件名"

f_time = re.search(r'上传时间:</span>(.+?)<br>', first_page)
f_time = re.search(r'>(\d+\s?[秒天分小][钟时]?前|[昨前]天\s?[\d:]+?|\d+\s?天前|\d{4}-\d\d-\d\d)<', first_page)
f_time = f_time.group(1) if f_time else ''
f_size = re.search(r'文件大小:</span>(.+?)<br>', first_page) or \
f_size = re.search(r'大小.+?(\d[\d.]+\s?[BKM]?)<', first_page) or \
re.search(r'大小:(.+?)</div>', first_page) # VIP 分享页面
f_size = f_size.group(1) if f_size else ''
f_desc = re.search(r'文件描述</span><br>\n?\s*(.+?)\s*</td>', first_page)
f_desc = re.search(r'文件描述.+?</span><br>\n?\s*(.*?)\s*</td>', first_page)
f_desc = f_desc.group(1) if f_desc else ''
first_page = self._get(self._host_url + para)
if not first_page:
return FileDetail(LanZouCloud.NETWORK_ERROR)
return FileDetail(LanZouCloud.NETWORK_ERROR, name=f_name, time=f_time, size=f_size, desc=f_desc, pwd=pwd, url=share_url)
first_page = remove_notes(first_page.text)
# 一般情况 sign 的值就在 data 里,有时放在变量后面
sign = re.search(r"'sign':(.+?),", first_page).group(1)
Expand All @@ -489,22 +491,24 @@ def get_file_info_by_url(self, share_url, pwd='') -> FileDetail:
post_data = {'action': 'downprocess', 'sign': sign, 'ves': 1}
link_info = self._post(self._host_url + '/ajaxm.php', post_data)
if not link_info:
return FileDetail(LanZouCloud.NETWORK_ERROR)
return FileDetail(LanZouCloud.NETWORK_ERROR, name=f_name, time=f_time, size=f_size, desc=f_desc, pwd=pwd, url=share_url)
else:
link_info = link_info.json()
# 这里开始获取文件直链
if link_info['zt'] == 0: # sign 错误,提示:已超时,请刷新
logger.debug(f"Sign Error: {sign}")
return FileDetail(LanZouCloud.FAILED)
return FileDetail(LanZouCloud.FAILED, pwd=pwd, url=share_url)
elif link_info['zt'] == 1:
fake_url = link_info['dom'] + '/file/' + link_info['url'] # 假直连,存在流量异常检测
download_page = self._get(fake_url, allow_redirects=False)
if not download_page:
return FileDetail(LanZouCloud.NETWORK_ERROR)
download_page.encoding = 'utf-8'
if '网络不正常' in download_page.text: # 流量异常,要求输入验证码
file_token = re.findall(r"'file':'(.+?)'", download_page.text)[0]
direct_url = self._captcha_recognize(file_token)
if not direct_url:
return FileDetail(LanZouCloud.CAPTCHA_ERROR)
return FileDetail(LanZouCloud.CAPTCHA_ERROR, name=f_name, time=f_time, size=f_size, desc=f_desc, pwd=pwd, url=share_url)
else:
direct_url = download_page.headers['Location'] # 重定向后的真直链

Expand All @@ -513,7 +517,7 @@ def get_file_info_by_url(self, share_url, pwd='') -> FileDetail:
name=f_name, size=f_size, type=f_type, time=f_time,
desc=f_desc, pwd=pwd, url=share_url, durl=direct_url)
else:
return FileDetail(LanZouCloud.PASSWORD_ERROR)
return FileDetail(LanZouCloud.FAILED)

def get_file_info_by_id(self, file_id) -> FileDetail:
"""通过 id 获取文件信息"""
Expand Down Expand Up @@ -896,7 +900,7 @@ def down_file_by_url(self, share_url, pwd='', save_path='./Download', callback=N
chunk_size = 4096
last_512_bytes = b'' # 用于识别文件是否携带真实文件名信息
headers = {**self._headers, 'Range': 'bytes=%d-' % now_size}
resp = self._get(info.durl, stream=True, headers=headers)
resp = self._get(info.durl, stream=True, headers=headers, timeout=None)

if resp is None: # 网络异常
return LanZouCloud.FAILED
Expand Down Expand Up @@ -1187,12 +1191,13 @@ def get_share_file_info(self, share_url, pwd=""):
else:
return {"code": LanZouCloud.PASSWORD_ERROR, "info": ""}
else:
f_name = re.findall(r'<div style="[^"]+">([^><]*?)</div>', first_page)
if not f_name:
f_name = re.findall(r"var filename = '(.*)';", first_page)
if not f_name:
f_name = re.findall(r'<div class="filethetext" id="[^"]*">(.*?)?</div>', first_page)
f_name = f_name[0] if f_name else ""
f_name = re.search(r"<title>(.+?) - 蓝奏云</title>", first_page) or \
re.search(r'<div class="filethetext".+?>([^<>]+?)</div>', first_page) or \
re.search(r'<div style="font-size.+?>([^<>].+?)</div>', first_page) or \
re.search(r"var filename = '(.+?)';", first_page) or \
re.search(r'id="filenajax">(.+?)</div>', first_page) or \
re.search(r'<div class="b"><span>([^<>]+?)</span></div>', first_page)
f_name = f_name.group(1) if f_name else "未匹配到文件名"

f_size = re.findall(r'文件大小:</span>([\.0-9 MKBmkbGg]+)<br', first_page)
f_size = f_size[0] if f_size else ""
Expand Down
46 changes: 39 additions & 7 deletions lanzou/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@
import re
from datetime import timedelta, datetime
from random import uniform, choices, sample, shuffle, choice
import requests

__all__ = ['logger', 'remove_notes', 'name_format', 'time_format', 'is_name_valid', 'is_file_url',
'is_folder_url', 'big_file_split', 'un_serialize', 'let_me_upload']

# 调试日志设置
logger = logging.getLogger('lanzou')
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.INFO)
formatter = logging.Formatter(
fmt="%(asctime)s [line:%(lineno)d] %(funcName)s %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S")
Expand All @@ -23,11 +24,20 @@
logger.addHandler(console)


# Default HTTP request headers passed to requests.get() by is_file_url() and
# is_folder_url() when they have to download a share page to classify it.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'Referer': 'https://www.lanzous.com',
'Accept-Language': 'zh-CN,zh;q=0.9',
}


def remove_notes(html: str) -> str:
    """Strip HTML and JavaScript comments from a page.

    The site's pages frequently ship with superseded code that was merely
    commented out; removing it keeps the regex-based extraction elsewhere
    from matching stale markup.

    Args:
        html: Raw page source.

    Returns:
        The page source with ``<!-- ... -->`` comments and ``//`` line
        comments removed.  ``.`` does not match newlines here, so an HTML
        comment that spans several lines is left in place.
    """
    # HTML comments, plus `//` comments that are preceded by whitespace.
    # Requiring whitespace (\s+, not \s*) keeps the `//` of URLs such as
    # "https://..." intact.
    html = re.sub(r'<!--.+?-->|\s+//\s*.+', '', html)
    # Trailing `//` comments that directly follow a `,` or `;`; the code
    # before the comment is kept via the back-reference.
    html = re.sub(r'(.+?[,;])\s*//.+', r'\1', html)
    return html


def name_format(name: str) -> str:
Expand Down Expand Up @@ -65,14 +75,36 @@ def is_name_valid(filename: str) -> bool:

def is_file_url(share_url: str) -> bool:
    """Return True if ``share_url`` is a share link to a single file.

    Ordinary users' links follow a fixed pattern and are recognised
    offline.  VIP users may customise their links, so any other URL on the
    site is downloaded and classified by markers in the page.

    Args:
        share_url: The share link to classify.

    Returns:
        True for a file share link; False for anything else, including
        when the page cannot be fetched.
    """
    # Dots are escaped so look-alike hosts (e.g. "wwwxlanzous.com") are
    # rejected instead of being fetched below.
    base_pat = r'https?://www\.lanzous\.com/.+'
    user_pat = r'https?://www\.lanzous\.com/i[a-z0-9]{5,}/?'  # ordinary users
    if not re.fullmatch(base_pat, share_url):
        return False
    if re.fullmatch(user_pat, share_url):
        return True

    # VIP link: the URL alone is not conclusive, inspect the page itself.
    try:
        # A timeout keeps an unresponsive server from hanging the caller.
        html = requests.get(share_url, headers=headers, timeout=10).text
    except Exception:
        # Deliberately best-effort: any failure means "not a file link".
        return False
    html = remove_notes(html)
    return bool(re.search(r'class="fileinfo"|id="file"|文件描述', html))


def is_folder_url(share_url: str) -> bool:
    """Return True if ``share_url`` is a share link to a folder.

    Ordinary users' folder links follow a fixed pattern and are recognised
    offline.  VIP users may customise their links, so any other URL on the
    site is downloaded and classified by a marker in the page.

    Args:
        share_url: The share link to classify.

    Returns:
        True for a folder share link; False for anything else, including
        when the page cannot be fetched.
    """
    # Dots are escaped so look-alike hosts (e.g. "wwwxlanzous.com") are
    # rejected instead of being fetched below.
    base_pat = r'https?://www\.lanzous\.com/.+'
    user_pat = r'https?://www\.lanzous\.com/b[a-z0-9]{7,}/?'  # ordinary users
    if not re.fullmatch(base_pat, share_url):
        return False
    if re.fullmatch(user_pat, share_url):
        return True

    # VIP link: the URL alone is not conclusive, inspect the page itself.
    try:
        # A timeout keeps an unresponsive server from hanging the caller.
        html = requests.get(share_url, headers=headers, timeout=10).text
    except Exception:
        # Deliberately best-effort: any failure means "not a folder link".
        return False
    html = remove_notes(html)
    return bool(re.search(r'id="infos"', html))


def un_serialize(data: bytes):
Expand All @@ -82,7 +114,7 @@ def un_serialize(data: bytes):
if not isinstance(ret, dict):
return None
return ret
except (TypeError, pickle.UnpicklingError):
except (TypeError, pickle.UnpicklingError, ValueError):
return None


Expand Down

0 comments on commit 1de9e6d

Please sign in to comment.