update api

rachpt · Apr 6, 2020 · 1de9e6d · 1de9e6d
1 parent 44986b2
commit 1de9e6d
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 30 deletions.
diff --git a/lanzou/api/core.py b/lanzou/api/core.py
@@ -430,33 +430,33 @@ def _captcha_recognize(self, file_token):
     def get_file_info_by_url(self, share_url, pwd='') -> FileDetail:
         """获取文件各种信息(包括下载直链)"""
         if not is_file_url(share_url):  # 非文件链接返回错误
-            return FileDetail(LanZouCloud.URL_INVALID)
+            return FileDetail(LanZouCloud.URL_INVALID, pwd=pwd, url=share_url)
 
         first_page = self._get(share_url)  # 文件分享页面(第一页)
         if not first_page:
-            return FileDetail(LanZouCloud.NETWORK_ERROR)
+            return FileDetail(LanZouCloud.NETWORK_ERROR, pwd=pwd, url=share_url)
 
         first_page = remove_notes(first_page.text)  # 去除网页里的注释
         if '文件取消' in first_page:
-            return FileDetail(LanZouCloud.FILE_CANCELLED)
+            return FileDetail(LanZouCloud.FILE_CANCELLED, pwd=pwd, url=share_url)
 
         # 这里获取下载直链 304 重定向前的链接
         if '输入密码' in first_page:  # 文件设置了提取码时
             if len(pwd) == 0:
-                return FileDetail(LanZouCloud.LACK_PASSWORD)  # 没给提取码直接退出
+                return FileDetail(LanZouCloud.LACK_PASSWORD, pwd=pwd, url=share_url)  # 没给提取码直接退出
             # data : 'action=downprocess&sign=AGZRbwEwU2IEDQU6BDRUaFc8DzxfMlRjCjTPlVkWzFSYFY7ATpWYw_c_c&p='+pwd,
             sign = re.search(r"sign=(\w+?)&", first_page)
             sign = sign.group(1) if sign else ""
             post_data = {'action': 'downprocess', 'sign': sign, 'p': pwd}
             link_info = self._post(self._host_url + '/ajaxm.php', post_data)  # 保存了重定向前的链接信息和文件名
             second_page = self._get(share_url)  # 再次请求文件分享页面，可以看见文件名，时间，大小等信息(第二页)
             if not link_info or not second_page.text:
-                return FileDetail(LanZouCloud.NETWORK_ERROR)
+                return FileDetail(LanZouCloud.NETWORK_ERROR, pwd=pwd, url=share_url)
             link_info = link_info.json()
             second_page = remove_notes(second_page.text)
             # 提取文件信息
             f_name = link_info['inf']
-            f_size = re.search(r'大小：(.+?)</div>', second_page)
+            f_size = re.search(r'大小.+?(\d[\d.]+\s?[BKM]?)<', second_page)
             f_size = f_size.group(1) if f_size else ''
             f_time = re.search(r'class="n_file_infos">(.+?)</span>', second_page)
             f_time = f_time.group(1) if f_time else ''
@@ -465,22 +465,24 @@ def get_file_info_by_url(self, share_url, pwd='') -> FileDetail:
         else:  # 文件没有设置提取码时,文件信息都暴露在分享页面上
             para = re.search(r'<iframe.*?src="(.+?)"', first_page).group(1)  # 提取下载页面 URL 的参数
             # 文件名位置变化很多
-            f_name = re.search(r'<div class="filethetext".+?>([^<>]+?)</div>', first_page) or \
+            f_name = re.search(r"<title>(.+?) - 蓝奏云</title>", first_page) or \
+                     re.search(r'<div class="filethetext".+?>([^<>]+?)</div>', first_page) or \
                      re.search(r'<div style="font-size.+?>([^<>].+?)</div>', first_page) or \
                      re.search(r"var filename = '(.+?)';", first_page) or \
-                     re.search(r'id="filenajax">(.+?)</div>', first_page)  # VIP 分享页面
+                     re.search(r'id="filenajax">(.+?)</div>', first_page) or \
+                     re.search(r'<div class="b"><span>([^<>]+?)</span></div>', first_page)
             f_name = f_name.group(1) if f_name else "未匹配到文件名"
 
-            f_time = re.search(r'上传时间：</span>(.+?)<br>', first_page)
+            f_time = re.search(r'>(\d+\s?[秒天分小][钟时]?前|[昨前]天\s?[\d:]+?|\d+\s?天前|\d{4}-\d\d-\d\d)<', first_page)
             f_time = f_time.group(1) if f_time else ''
-            f_size = re.search(r'文件大小：</span>(.+?)<br>', first_page) or \
+            f_size = re.search(r'大小.+?(\d[\d.]+\s?[BKM]?)<', first_page) or \
                      re.search(r'大小：(.+?)</div>', first_page)  # VIP 分享页面
             f_size = f_size.group(1) if f_size else ''
-            f_desc = re.search(r'文件描述：</span><br>\n?\s*(.+?)\s*</td>', first_page)
+            f_desc = re.search(r'文件描述.+?</span><br>\n?\s*(.*?)\s*</td>', first_page)
             f_desc = f_desc.group(1) if f_desc else ''
             first_page = self._get(self._host_url + para)
             if not first_page:
-                return FileDetail(LanZouCloud.NETWORK_ERROR)
+                return FileDetail(LanZouCloud.NETWORK_ERROR, name=f_name, time=f_time, size=f_size, desc=f_desc, pwd=pwd, url=share_url)
             first_page = remove_notes(first_page.text)
             # 一般情况 sign 的值就在 data 里，有时放在变量后面
             sign = re.search(r"'sign':(.+?),", first_page).group(1)
@@ -489,22 +491,24 @@ def get_file_info_by_url(self, share_url, pwd='') -> FileDetail:
             post_data = {'action': 'downprocess', 'sign': sign, 'ves': 1}
             link_info = self._post(self._host_url + '/ajaxm.php', post_data)
             if not link_info:
-                return FileDetail(LanZouCloud.NETWORK_ERROR)
+                return FileDetail(LanZouCloud.NETWORK_ERROR, name=f_name, time=f_time, size=f_size, desc=f_desc, pwd=pwd, url=share_url)
             else:
                 link_info = link_info.json()
         # 这里开始获取文件直链
         if link_info['zt'] == 0:  # sign 错误，提示：已超时，请刷新
             logger.debug(f"Sign Error: {sign}")
-            return FileDetail(LanZouCloud.FAILED)
+            return FileDetail(LanZouCloud.FAILED, pwd=pwd, url=share_url)
         elif link_info['zt'] == 1:
             fake_url = link_info['dom'] + '/file/' + link_info['url']  # 假直连，存在流量异常检测
             download_page = self._get(fake_url, allow_redirects=False)
+            if not download_page:
+                return FileDetail(LanZouCloud.NETWORK_ERROR)
             download_page.encoding = 'utf-8'
             if '网络不正常' in download_page.text:  # 流量异常，要求输入验证码
                 file_token = re.findall(r"'file':'(.+?)'", download_page.text)[0]
                 direct_url = self._captcha_recognize(file_token)
                 if not direct_url:
-                    return FileDetail(LanZouCloud.CAPTCHA_ERROR)
+                    return FileDetail(LanZouCloud.CAPTCHA_ERROR, name=f_name, time=f_time, size=f_size, desc=f_desc, pwd=pwd, url=share_url)
             else:
                 direct_url = download_page.headers['Location']  # 重定向后的真直链
 
@@ -513,7 +517,7 @@ def get_file_info_by_url(self, share_url, pwd='') -> FileDetail:
                               name=f_name, size=f_size, type=f_type, time=f_time,
                               desc=f_desc, pwd=pwd, url=share_url, durl=direct_url)
         else:
-            return FileDetail(LanZouCloud.PASSWORD_ERROR)
+            return FileDetail(LanZouCloud.FAILED)
 
     def get_file_info_by_id(self, file_id) -> FileDetail:
         """通过 id 获取文件信息"""
@@ -896,7 +900,7 @@ def down_file_by_url(self, share_url, pwd='', save_path='./Download', callback=N
         chunk_size = 4096
         last_512_bytes = b''  # 用于识别文件是否携带真实文件名信息
         headers = {**self._headers, 'Range': 'bytes=%d-' % now_size}
-        resp = self._get(info.durl, stream=True, headers=headers)
+        resp = self._get(info.durl, stream=True, headers=headers, timeout=None)
 
         if resp is None:  # 网络异常
             return LanZouCloud.FAILED
@@ -1187,12 +1191,13 @@ def get_share_file_info(self, share_url, pwd=""):
             else:
                 return {"code": LanZouCloud.PASSWORD_ERROR, "info": ""}
         else:
-            f_name = re.findall(r'<div style="[^"]+">([^><]*?)</div>', first_page)
-            if not f_name:
-                f_name = re.findall(r"var filename = '(.*)';", first_page)
-            if not f_name:
-                f_name = re.findall(r'<div class="filethetext" id="[^"]*">(.*?)?</div>', first_page)
-            f_name = f_name[0] if f_name else ""
+            f_name = re.search(r"<title>(.+?) - 蓝奏云</title>", first_page) or \
+                     re.search(r'<div class="filethetext".+?>([^<>]+?)</div>', first_page) or \
+                     re.search(r'<div style="font-size.+?>([^<>].+?)</div>', first_page) or \
+                     re.search(r"var filename = '(.+?)';", first_page) or \
+                     re.search(r'id="filenajax">(.+?)</div>', first_page) or \
+                     re.search(r'<div class="b"><span>([^<>]+?)</span></div>', first_page)
+            f_name = f_name.group(1) if f_name else "未匹配到文件名"
 
             f_size = re.findall(r'文件大小：</span>([\.0-9 MKBmkbGg]+)<br', first_page)
             f_size = f_size[0] if f_size else ""

diff --git a/lanzou/api/utils.py b/lanzou/api/utils.py
@@ -8,13 +8,14 @@
 import re
 from datetime import timedelta, datetime
 from random import uniform, choices, sample, shuffle, choice
+import requests
 
 __all__ = ['logger', 'remove_notes', 'name_format', 'time_format', 'is_name_valid', 'is_file_url',
            'is_folder_url', 'big_file_split', 'un_serialize', 'let_me_upload']
 
 # 调试日志设置
 logger = logging.getLogger('lanzou')
-logger.setLevel(logging.DEBUG)
+logger.setLevel(logging.INFO)
 formatter = logging.Formatter(
     fmt="%(asctime)s [line:%(lineno)d] %(funcName)s %(levelname)s - %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S")
@@ -23,11 +24,20 @@
 logger.addHandler(console)
 
 
+headers = {  # HTTP headers passed to requests.get() by is_file_url() / is_folder_url() when they fetch a share page
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
+    'Referer': 'https://www.lanzous.com',
+    'Accept-Language': 'zh-CN,zh;q=0.9',
+}
+
+
 def remove_notes(html: str) -> str:
-    """删除网页的注释"""
-    # 去掉 html 里面的 // 和 <!-- --> 注释，防止干扰正则匹配提取数据
-    # 蓝奏云的前端程序员喜欢改完代码就把原来的代码注释掉,就直接推到生产环境了 =_=
-    return re.sub(r'<!--.+?-->|\s*//\s*.+', '', html)
+    """Remove comments from a web page's source text and return the result."""
+    # Strip // and <!-- --> comments from the HTML so they do not interfere with regex-based data extraction.
+    # Lanzou's front-end developers tend to comment out the old code after a change and push it straight to production =_=
+    html = re.sub(r'<!--.+?-->|\s+//\s*.+', '', html)  # <!-- --> comments and whitespace-preceded // comments; '.' does not cross newlines, so multi-line <!-- --> blocks are left in place
+    html = re.sub(r'(.+?[,;])\s*//.+', r'\1', html)  # trailing // comment after a ',' or ';': keep the code before it, drop the comment
+    return html
 
 
 def name_format(name: str) -> str:
@@ -65,14 +75,36 @@ def is_name_valid(filename: str) -> bool:
 
 def is_file_url(share_url: str) -> bool:
-    """判断是否为文件的分享链接"""
-    pat = 'https?://www.lanzous.com/[ti][a-z0-9]{5,}/?'
-    return True if re.fullmatch(pat, share_url) else False
+    """Return True if share_url is a file share link; links that do not fit the regular pattern are fetched and inspected."""
+    base_pat = r'https?://www\.lanzous\.com/.+'  # dots escaped so look-alike hosts such as "www-lanzous.com" are rejected before any request is made
+    user_pat = r'https?://www\.lanzous\.com/i[a-z0-9]{5,}/?'  # URL shape used by regular (non-VIP) users
+    if not re.fullmatch(base_pat, share_url):
+        return False
+    elif re.fullmatch(user_pat, share_url):
+        return True
+    else:  # VIP users' URLs are arbitrary, so the page content decides
+        try:
+            html = requests.get(share_url, headers=headers, timeout=15).text  # timeout so an unresponsive server cannot block the caller forever
+            html = remove_notes(html)
+            return bool(re.search(r'class="fileinfo"|id="file"|文件描述', html))
+        except Exception:  # best-effort probe: any failure (RequestException included) means "not a file link"
+            return False
 
 
 def is_folder_url(share_url: str) -> bool:
-    """判断是否为文件夹的分享链接"""
-    pat = 'https?://www.lanzous.com/b[a-z0-9]{7,}/?'
-    return True if re.fullmatch(pat, share_url) else False
+    """Return True if share_url is a folder share link; links that do not fit the regular pattern are fetched and inspected."""
+    base_pat = r'https?://www\.lanzous\.com/.+'  # dots escaped so look-alike hosts such as "www-lanzous.com" are rejected before any request is made
+    user_pat = r'https?://www\.lanzous\.com/b[a-z0-9]{7,}/?'  # URL shape of a regular folder link
+    if not re.fullmatch(base_pat, share_url):
+        return False
+    elif re.fullmatch(user_pat, share_url):
+        return True
+    else:  # VIP users' URLs are arbitrary, so the page content decides
+        try:
+            html = requests.get(share_url, headers=headers, timeout=15).text  # timeout so an unresponsive server cannot block the caller forever
+            html = remove_notes(html)
+            return bool(re.search(r'id="infos"', html))
+        except Exception:  # best-effort probe: any failure (RequestException included) means "not a folder link"
+            return False
 
 
 def un_serialize(data: bytes):
@@ -82,7 +114,7 @@ def un_serialize(data: bytes):
         if not isinstance(ret, dict):
             return None
         return ret
-    except (TypeError, pickle.UnpicklingError):
+    except (TypeError, pickle.UnpicklingError, ValueError):
         return None