From 2f4f18a503f861a5e093b24793131cb75a212633 Mon Sep 17 00:00:00 2001 From: Sihyung Park Date: Thu, 2 May 2019 14:46:36 +0900 Subject: [PATCH 1/7] add web font extractor (beta) --- README.md | 43 ++++++++++++- extract_webfont.py | 147 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+), 2 deletions(-) create mode 100755 extract_webfont.py diff --git a/README.md b/README.md index 40b9b4d..6b46ae6 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,50 @@ -# woff2otf +# WebFontUtil -This is a small utility to convert WOFF files to the OTF font format. It uses Python 3, so you need to have it installed in order to run it. +This is a small utility specialized in extracting and converting web fonts. +It uses Python 3, so you need to have it installed in order to run it. ## Usage +### 1. woff2otf To run the script, simply invoke it from the command line: ``` ./woff2otf.py font.woff font.otf ``` The first parameter is the source file (the WOFF) font, and the second parameter is the output file (in OTF format). + +### 2. web font extractor +To run the script. type in the following command in terminal. +``` +./extract_webfont.py 'https://sample1.com' +``` + +make sure the URLs have 'http' or 'https' header in front of it, and are wrapped with quotation marks(`'`). + +Then, the script will extract all the fonts it can extract from the URL and prompt them as follows. + +``` +============================= WARNING ============================= + Some fonts from the web might be someone's intellectual property + and thus be protected by the corresponding laws. Please be aware + and use this script responsibly. + The programmer of this script and the script itself are not + responsible in any way for problems caused by using the script. +=================================================================== + Select font numbers: (e.g. 0,1) + [0] Together-KwonJungae.woff + [1] KakaoLight.woff + [2] KakaoRegular.woff + [3] KakaoBold.woff + [4] NotoSans-Light.woff + [5] NotoSans-Medium.woff + [6] NotoSans-Regular.woff +``` + + +Just select corresponding numbers of fonts you want to download. In this example, 4, 5, 6. + +``` + Font saved: ./NotoSans-Light.otf + Font saved: ./NotoSans-Medium.otf + Font saved: ./NotoSans-Regular.otf +``` diff --git a/extract_webfont.py b/extract_webfont.py new file mode 100755 index 0000000..71b255c --- /dev/null +++ b/extract_webfont.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +from bs4 import BeautifulSoup +from woff2otf import convert_streams +import requests +import sys +import re +import os + + +def print_warning(): + print("\n============================= WARNING =============================") + print(" Some fonts from the web might be someone's intellectual property") + print(" and thus be protected by the corresponding laws. Please be aware") + print(" and use this script responsibly.") + print(" The programmer of this script and the script itself are not") + print(" responsible in any way for problems caused by using the script.") + print("===================================================================") + return + + +def extract_webfont(URL): + # Extract stylesheet CSS link from the web\ + base_URL = URL[:URL.find("/", 9)] + + response = requests.get(URL) + soup = BeautifulSoup(response.text, "html5lib") + style_links = soup.find_all("link", {"rel": "stylesheet"}) + + for href in style_links: + if "font" in href["href"]: + style_link = href + + + # Get all web font(.woff) links from the CSS + try: + if style_link["href"].startswith("http"): + style_URL = style_link["href"] + elif style_link["href"].startswith("//"): + style_URL = "https:" + style_link["href"] + else: + style_URL = os.path.join(base_URL.strip("/"), style_link["href"].strip("/")) + response = requests.get(style_URL) + except (UnboundLocalError, NameError): + raise NameError("no downloadable fonts found") + style_URL + + + # Parse font CSS into Python dictionary + def get_font(name_link): + links = re.findall(r'url\((.+?)\)format', name_link) + #print(links) + + for idx, link in enumerate(links): + if link.endswith(".ttf"): + is_ttf = True + idx_ttf = idx + break + else: + is_ttf = False + + for idx, link in enumerate(links): + if link.endswith(".woff"): + is_woff = True + idx_woff = idx + break + else: + is_woff = False + + if is_ttf: + return os.path.basename(links[idx_ttf]), links[idx_ttf] + elif is_woff: + return os.path.basename(links[idx_woff]), links[idx_woff] + else: + raise ValueError("no downloadable fonts found") + + response_txt = response.text.replace("\n", "").replace(" ", "").replace("\t", "") + font_family_list = re.findall(r"@font-face{(.*?)}", response_txt) + font_family_list + + font_dict = {get_font(name_link)[0]: get_font(name_link)[1] for name_link in font_family_list} + + font_dict + if not font_dict: + raise ValueError("no downloadable fonts found") + + # Prompt user input + prompt = " Select font numbers: (e.g. 0,1)\n" + for idx, name in enumerate(font_dict.keys()): + prompt += f" [{idx}] {name}\n" + + # comma-separated input. e.g. 1,2,3 + selected = input(prompt) + if selected.replace(",", "").replace(" ", "").isnumeric(): + for i in map(int, selected.replace(" ", "").split(",")): + font_name = list(font_dict.keys())[i] + if list(font_dict.values())[i].startswith("http"): + download_URL = list(font_dict.values())[i] + elif list(font_dict.values())[i].startswith("//"): + download_URL = "https:" + list(font_dict.values())[i] + else: + download_URL = os.path.join(base_URL.strip("/"), list(font_dict.values())[i].strip("/")) + + # write font from the web + if download_URL.lower().endswith(".woff"): + woff_content = requests.get(download_URL).content + woff_fname = font_name + otf_fname = font_name.split(".")[-2] + ".otf" + with open(woff_fname, "wb") as wb: + wb.write(woff_content) + + # convert woff to otf and remove woff, if needed + woff_fhand = open(woff_fname, "rb") + otf_fhand = open(otf_fname, "wb") + convert_streams(woff_fhand, otf_fhand) + woff_fhand.close() + otf_fhand.close() + + os.remove(woff_fname) + print(f" Font saved: ./{otf_fname}") + elif download_URL.lower().endswith(".ttf"): + ttf_content = requests.get(download_URL).content + ttf_fname = font_name + with open(ttf_fname, "wb") as wb: + wb.write(ttf_content) + print(f" Font saved: ./{ttf_fname}") + else: + raise ValueError(f'unknown type: {download_URL.split(".")[-1]}') + + else: + raise ValueError("response should be a number") + + +def main(argv): + if len(argv) == 2: + print_warning() + extract_webfont(argv[1]) + elif len(argv) > 2: + print_warning() + for arg in argv[1:]: + extract_webfont(arg) + print("\n", "="*50, "\n") + else: + raise ValueError("input URL does not exist") + + +if __name__ == "__main__": + main(sys.argv) \ No newline at end of file From 6b7cfd408f23e85aafa8778cd484b90cc25ea14a Mon Sep 17 00:00:00 2001 From: naturale0 Date: Thu, 2 May 2019 15:44:18 +0900 Subject: [PATCH 2/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b46ae6..4766a16 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ To run the script. type in the following command in terminal. make sure the URLs have 'http' or 'https' header in front of it, and are wrapped with quotation marks(`'`). -Then, the script will extract all the fonts it can extract from the URL and prompt them as follows. +Then, the script will list all the fonts it can extract from the URL and prompt them as follows. ``` ============================= WARNING ============================= From 10e651232175fb57c887b25f458bcb28b17b0e37 Mon Sep 17 00:00:00 2001 From: Sihyung Park Date: Thu, 2 May 2019 16:00:18 +0900 Subject: [PATCH 3/7] fix bug when parsing URL --- extract_webfont.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/extract_webfont.py b/extract_webfont.py index 71b255c..d445876 100755 --- a/extract_webfont.py +++ b/extract_webfont.py @@ -1,4 +1,22 @@ #!/usr/bin/env python3 +# +# Copyright 2019, Sihyung Park (https://github.com/naturale0) +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# A tool to convert a WOFF back to a TTF/OTF font file, in pure Python + from bs4 import BeautifulSoup from woff2otf import convert_streams import requests @@ -20,12 +38,14 @@ def print_warning(): def extract_webfont(URL): # Extract stylesheet CSS link from the web\ - base_URL = URL[:URL.find("/", 9)] - + if URL.find("/", 9) != -1: + base_URL = URL[:URL.find("/", 9)] + base_URL = URL + response = requests.get(URL) soup = BeautifulSoup(response.text, "html5lib") style_links = soup.find_all("link", {"rel": "stylesheet"}) - + for href in style_links: if "font" in href["href"]: style_link = href From 5000d1b8de034c7a9bf81a1f80eacd33e73080e4 Mon Sep 17 00:00:00 2001 From: Sihyung Park Date: Thu, 2 May 2019 17:03:52 +0900 Subject: [PATCH 4/7] improve extracting ability --- extract_webfont.py | 138 +++++++++++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 54 deletions(-) diff --git a/extract_webfont.py b/extract_webfont.py index d445876..19ed754 100755 --- a/extract_webfont.py +++ b/extract_webfont.py @@ -40,35 +40,44 @@ def extract_webfont(URL): # Extract stylesheet CSS link from the web\ if URL.find("/", 9) != -1: base_URL = URL[:URL.find("/", 9)] - base_URL = URL + else: + base_URL = URL response = requests.get(URL) soup = BeautifulSoup(response.text, "html5lib") style_links = soup.find_all("link", {"rel": "stylesheet"}) - for href in style_links: - if "font" in href["href"]: - style_link = href + #href_fonts = [] + #for href in style_links: + # if "font" in href["href"]: + # href_fonts.append(href) # Get all web font(.woff) links from the CSS try: - if style_link["href"].startswith("http"): - style_URL = style_link["href"] - elif style_link["href"].startswith("//"): - style_URL = "https:" + style_link["href"] - else: - style_URL = os.path.join(base_URL.strip("/"), style_link["href"].strip("/")) - response = requests.get(style_URL) + formatted_href = [] + for style_link in style_links: + if style_link["href"].startswith("http"): + style_URL = style_link["href"] + elif style_link["href"].startswith("//"): + style_URL = "https:" + style_link["href"] + else: + style_URL = os.path.join(base_URL.strip("/"), style_link["href"].strip("/")) + formatted_href.append(style_URL) + except (UnboundLocalError, NameError): raise NameError("no downloadable fonts found") - style_URL + # Remove CSS links that does not contain font-face info + href_with_font = [] + for href in formatted_href: + response = requests.get(href).text + if response.find("font-face") != -1: + href_with_font.append(href) # Parse font CSS into Python dictionary def get_font(name_link): links = re.findall(r'url\((.+?)\)format', name_link) - #print(links) for idx, link in enumerate(links): if link.endswith(".ttf"): @@ -93,13 +102,18 @@ def get_font(name_link): else: raise ValueError("no downloadable fonts found") - response_txt = response.text.replace("\n", "").replace(" ", "").replace("\t", "") - font_family_list = re.findall(r"@font-face{(.*?)}", response_txt) - font_family_list - - font_dict = {get_font(name_link)[0]: get_font(name_link)[1] for name_link in font_family_list} + + font_dict = {} + for style_URL in href_with_font: + response = requests.get(style_URL) + response_txt = response.text.replace("\n", "").replace(" ", "").replace("\t", "") + font_family_list = re.findall(r"@font-face{(.*?)}", response_txt) + + try: + font_dict.update({get_font(name_link)[0]: get_font(name_link)[1] for name_link in font_family_list}) + except ValueError: + continue - font_dict if not font_dict: raise ValueError("no downloadable fonts found") @@ -109,43 +123,59 @@ def get_font(name_link): prompt += f" [{idx}] {name}\n" # comma-separated input. e.g. 1,2,3 + # or * (all available fonts) + ## Download selected fonts! + def download_font_at(i): + font_name = list(font_dict.keys())[i] + if list(font_dict.values())[i].startswith("http"): + download_URL = list(font_dict.values())[i] + elif list(font_dict.values())[i].startswith("//"): + download_URL = "https:" + list(font_dict.values())[i] + else: + download_URL = os.path.join(base_URL.strip("/"), list(font_dict.values())[i].strip("/")) + #print(download_URL) + + # write font from the web + if download_URL.lower().endswith(".woff"): + response = requests.get(download_URL) + if response.status_code == 404: + print( "Failed to download: download page not found (404)") + return + woff_content = response.content + woff_fname = font_name + otf_fname = os.path.splitext(font_name)[0] + ".otf" + with open(woff_fname, "wb") as wb: + wb.write(woff_content) + + # convert woff to otf and remove woff, if needed + woff_fhand = open(woff_fname, "rb") + otf_fhand = open(otf_fname, "wb") + convert_streams(woff_fhand, otf_fhand) + woff_fhand.close() + otf_fhand.close() + + os.remove(woff_fname) + print(f" Font saved: ./{otf_fname}") + elif download_URL.lower().endswith(".ttf"): + response = requests.get(download_URL) + if response.status_code == 404: + print( f"Failed: {font_name} - download page not found (404)") + return + ttf_content = response.content + ttf_fname = font_name + with open(ttf_fname, "wb") as wb: + wb.write(ttf_content) + print(f" Font saved: ./{ttf_fname}") + else: + raise ValueError(f'unknown type: {download_URL.split(".")[-1]}') + selected = input(prompt) if selected.replace(",", "").replace(" ", "").isnumeric(): - for i in map(int, selected.replace(" ", "").split(",")): - font_name = list(font_dict.keys())[i] - if list(font_dict.values())[i].startswith("http"): - download_URL = list(font_dict.values())[i] - elif list(font_dict.values())[i].startswith("//"): - download_URL = "https:" + list(font_dict.values())[i] - else: - download_URL = os.path.join(base_URL.strip("/"), list(font_dict.values())[i].strip("/")) - - # write font from the web - if download_URL.lower().endswith(".woff"): - woff_content = requests.get(download_URL).content - woff_fname = font_name - otf_fname = font_name.split(".")[-2] + ".otf" - with open(woff_fname, "wb") as wb: - wb.write(woff_content) - - # convert woff to otf and remove woff, if needed - woff_fhand = open(woff_fname, "rb") - otf_fhand = open(otf_fname, "wb") - convert_streams(woff_fhand, otf_fhand) - woff_fhand.close() - otf_fhand.close() - - os.remove(woff_fname) - print(f" Font saved: ./{otf_fname}") - elif download_URL.lower().endswith(".ttf"): - ttf_content = requests.get(download_URL).content - ttf_fname = font_name - with open(ttf_fname, "wb") as wb: - wb.write(ttf_content) - print(f" Font saved: ./{ttf_fname}") - else: - raise ValueError(f'unknown type: {download_URL.split(".")[-1]}') - + for idx in map(int, selected.replace(" ", "").split(",")): + download_font_at(idx) + elif selected.strip() == "*": + for idx in range(len(font_dict)): + download_font_at(idx) else: raise ValueError("response should be a number") From db5e10bd00ce7f4cdf2d46fe89b3b5e4e0a4cab7 Mon Sep 17 00:00:00 2001 From: Sihyung Park Date: Thu, 2 May 2019 17:14:18 +0900 Subject: [PATCH 5/7] improve readability --- extract_webfont.py | 150 +++++++++++++++++++++++---------------------- 1 file changed, 77 insertions(+), 73 deletions(-) diff --git a/extract_webfont.py b/extract_webfont.py index 19ed754..4d9f6e0 100755 --- a/extract_webfont.py +++ b/extract_webfont.py @@ -38,6 +38,9 @@ def print_warning(): def extract_webfont(URL): # Extract stylesheet CSS link from the web\ + global base_URL + global font_dict + if URL.find("/", 9) != -1: base_URL = URL[:URL.find("/", 9)] else: @@ -76,33 +79,6 @@ def extract_webfont(URL): href_with_font.append(href) # Parse font CSS into Python dictionary - def get_font(name_link): - links = re.findall(r'url\((.+?)\)format', name_link) - - for idx, link in enumerate(links): - if link.endswith(".ttf"): - is_ttf = True - idx_ttf = idx - break - else: - is_ttf = False - - for idx, link in enumerate(links): - if link.endswith(".woff"): - is_woff = True - idx_woff = idx - break - else: - is_woff = False - - if is_ttf: - return os.path.basename(links[idx_ttf]), links[idx_ttf] - elif is_woff: - return os.path.basename(links[idx_woff]), links[idx_woff] - else: - raise ValueError("no downloadable fonts found") - - font_dict = {} for style_URL in href_with_font: response = requests.get(style_URL) @@ -124,51 +100,7 @@ def get_font(name_link): # comma-separated input. e.g. 1,2,3 # or * (all available fonts) - ## Download selected fonts! - def download_font_at(i): - font_name = list(font_dict.keys())[i] - if list(font_dict.values())[i].startswith("http"): - download_URL = list(font_dict.values())[i] - elif list(font_dict.values())[i].startswith("//"): - download_URL = "https:" + list(font_dict.values())[i] - else: - download_URL = os.path.join(base_URL.strip("/"), list(font_dict.values())[i].strip("/")) - #print(download_URL) - - # write font from the web - if download_URL.lower().endswith(".woff"): - response = requests.get(download_URL) - if response.status_code == 404: - print( "Failed to download: download page not found (404)") - return - woff_content = response.content - woff_fname = font_name - otf_fname = os.path.splitext(font_name)[0] + ".otf" - with open(woff_fname, "wb") as wb: - wb.write(woff_content) - - # convert woff to otf and remove woff, if needed - woff_fhand = open(woff_fname, "rb") - otf_fhand = open(otf_fname, "wb") - convert_streams(woff_fhand, otf_fhand) - woff_fhand.close() - otf_fhand.close() - - os.remove(woff_fname) - print(f" Font saved: ./{otf_fname}") - elif download_URL.lower().endswith(".ttf"): - response = requests.get(download_URL) - if response.status_code == 404: - print( f"Failed: {font_name} - download page not found (404)") - return - ttf_content = response.content - ttf_fname = font_name - with open(ttf_fname, "wb") as wb: - wb.write(ttf_content) - print(f" Font saved: ./{ttf_fname}") - else: - raise ValueError(f'unknown type: {download_URL.split(".")[-1]}') - + ## Download selected fonts! selected = input(prompt) if selected.replace(",", "").replace(" ", "").isnumeric(): for idx in map(int, selected.replace(" ", "").split(",")): @@ -180,6 +112,78 @@ def download_font_at(i): raise ValueError("response should be a number") +def get_font(name_link): + links = re.findall(r'url\((.+?)\)format', name_link) + + for idx, link in enumerate(links): + if link.endswith(".ttf"): + is_ttf = True + idx_ttf = idx + break + else: + is_ttf = False + + for idx, link in enumerate(links): + if link.endswith(".woff"): + is_woff = True + idx_woff = idx + break + else: + is_woff = False + + if is_ttf: + return os.path.basename(links[idx_ttf]), links[idx_ttf] + elif is_woff: + return os.path.basename(links[idx_woff]), links[idx_woff] + else: + raise ValueError("no downloadable fonts found") + + +def download_font_at(i): + font_name = list(font_dict.keys())[i] + if list(font_dict.values())[i].startswith("http"): + download_URL = list(font_dict.values())[i] + elif list(font_dict.values())[i].startswith("//"): + download_URL = "https:" + list(font_dict.values())[i] + else: + download_URL = os.path.join(base_URL.strip("/"), list(font_dict.values())[i].strip("/")) + #print(download_URL) + + # write font from the web + if download_URL.lower().endswith(".woff"): + response = requests.get(download_URL) + if response.status_code == 404: + print( "Failed to download: download page not found (404)") + return + woff_content = response.content + woff_fname = font_name + otf_fname = os.path.splitext(font_name)[0] + ".otf" + with open(woff_fname, "wb") as wb: + wb.write(woff_content) + + # convert woff to otf and remove woff, if needed + woff_fhand = open(woff_fname, "rb") + otf_fhand = open(otf_fname, "wb") + convert_streams(woff_fhand, otf_fhand) + woff_fhand.close() + otf_fhand.close() + + os.remove(woff_fname) + print(f" Font saved: ./{otf_fname}") + elif download_URL.lower().endswith(".ttf"): + response = requests.get(download_URL) + if response.status_code == 404: + print( f"Failed: {font_name} - download page not found (404)") + return + ttf_content = response.content + ttf_fname = font_name + with open(ttf_fname, "wb") as wb: + wb.write(ttf_content) + print(f" Font saved: ./{ttf_fname}") + else: + raise ValueError(f'unknown type: {download_URL.split(".")[-1]}') + + def main(argv): if len(argv) == 2: print_warning() @@ -194,4 +198,4 @@ def main(argv): if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) From 04a38cb774239ed3137036b29b91e538e011b92b Mon Sep 17 00:00:00 2001 From: naturale0 Date: Mon, 6 May 2019 20:44:00 +0900 Subject: [PATCH 6/7] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4766a16..c9a0877 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,9 @@ To run the script. type in the following command in terminal. ./extract_webfont.py 'https://sample1.com' ``` -make sure the URLs have 'http' or 'https' header in front of it, and are wrapped with quotation marks(`'`). +make sure the URLs have 'http' or 'https' header in front of them, and are wrapped with quotation marks(`'`). -Then, the script will list all the fonts it can extract from the URL and prompt them as follows. +Then, the script will list all of the fonts it can extract from the URL and prompt them as follows. ``` ============================= WARNING ============================= From 774d13c222f295869ca55f1f6a1fc41eb3157683 Mon Sep 17 00:00:00 2001 From: naturale0 Date: Mon, 6 May 2019 20:46:04 +0900 Subject: [PATCH 7/7] fix script description --- extract_webfont.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_webfont.py b/extract_webfont.py index 4d9f6e0..f3a1fff 100755 --- a/extract_webfont.py +++ b/extract_webfont.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# A tool to convert a WOFF back to a TTF/OTF font file, in pure Python +# A tool to extract and download font from input URLs from bs4 import BeautifulSoup from woff2otf import convert_streams