Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add web font extractor #13

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,50 @@
# woff2otf
# WebFontUtil

This is a small utility to convert WOFF files to the OTF font format. It uses Python 3, so you need to have it installed in order to run it.
This is a small utility specialized in extracting and converting web fonts.
It uses Python 3, so you need to have it installed in order to run it.

## Usage
### 1. woff2otf
To run the script, simply invoke it from the command line:
```
./woff2otf.py font.woff font.otf
```

The first parameter is the source file (the WOFF font), and the second parameter is the output file (in OTF format).

### 2. web font extractor
To run the script, type the following command in a terminal:
```
./extract_webfont.py 'https://sample1.com'
```

Make sure each URL starts with `http://` or `https://` and is wrapped in quotation marks (`'`).

Then the script lists all of the fonts it can extract from the URL and prompts you to choose among them, as follows.

```
============================= WARNING =============================
Some fonts from the web might be someone's intellectual property
and thus be protected by the corresponding laws. Please be aware
and use this script responsibly.
The programmer of this script and the script itself are not
responsible in any way for problems caused by using the script.
===================================================================
Select font numbers: (e.g. 0,1)
[0] Together-KwonJungae.woff
[1] KakaoLight.woff
[2] KakaoRegular.woff
[3] KakaoBold.woff
[4] NotoSans-Light.woff
[5] NotoSans-Medium.woff
[6] NotoSans-Regular.woff
```


Enter the numbers of the fonts you want to download, separated by commas (or `*` to download all of them). In this example, `4,5,6`.

```
Font saved: ./NotoSans-Light.otf
Font saved: ./NotoSans-Medium.otf
Font saved: ./NotoSans-Regular.otf
```
201 changes: 201 additions & 0 deletions extract_webfont.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#!/usr/bin/env python3
#
# Copyright 2019, Sihyung Park (https://github.com/naturale0)
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# A tool to extract and download font from input URLs

from bs4 import BeautifulSoup
from woff2otf import convert_streams
import requests
import sys
import re
import os


def print_warning():
    """Print the intellectual-property disclaimer banner to standard output."""
    banner_lines = (
        "\n============================= WARNING =============================",
        " Some fonts from the web might be someone's intellectual property",
        " and thus be protected by the corresponding laws. Please be aware",
        " and use this script responsibly.",
        " The programmer of this script and the script itself are not",
        " responsible in any way for problems caused by using the script.",
        "===================================================================",
    )
    # One print per banner line, top to bottom.
    for banner_line in banner_lines:
        print(banner_line)


def extract_webfont(URL):
    """List the fonts declared by a page's stylesheets and download the chosen ones.

    The page at *URL* is fetched, every ``<link rel="stylesheet">`` it
    references is downloaded, and each ``@font-face`` rule found there is
    handed to ``get_font``. The user is then asked which fonts to fetch, and
    ``download_font_at`` is called once per selected index.

    Sets two module globals read by ``download_font_at``:

    * ``base_URL``  -- ``scheme://host`` of the fetched page.
    * ``font_dict`` -- maps a font file name to its absolute download URL.

    Raises:
        ValueError: no usable font was found, or the answer typed at the
            prompt is neither ``*`` nor a comma-separated list of numbers.
        IndexError: a selected number does not correspond to a listed font.
    """
    # Imported locally so the function carries its own URL helpers.
    from urllib.parse import urljoin, urlsplit

    global base_URL
    global font_dict

    response = requests.get(URL)
    # Relative links must be resolved against the page actually served,
    # which differs from URL when the request was redirected.
    page_URL = response.url or URL
    page_parts = urlsplit(page_URL)
    base_URL = f"{page_parts.scheme}://{page_parts.netloc}"

    soup = BeautifulSoup(response.text, "html5lib")

    # Collect absolute stylesheet URLs. urljoin copes with absolute,
    # protocol-relative ("//host/x.css") and relative hrefs alike.
    stylesheet_URLs = []
    for link_tag in soup.find_all("link", {"rel": "stylesheet"}):
        href = link_tag.get("href")
        if href and href.strip():
            stylesheet_URLs.append(urljoin(page_URL, href.strip()))
    # Drop duplicates but keep document order so the menu stays stable.
    stylesheet_URLs = list(dict.fromkeys(stylesheet_URLs))

    # Parse every stylesheet once and record the font each @font-face offers.
    font_dict = {}
    for style_URL in stylesheet_URLs:
        css_text = requests.get(style_URL).text
        if "font-face" not in css_text:
            continue

        # Whitespace is removed so a rule body can be matched on one line.
        compact_css = re.sub(r"\s+", "", css_text)
        for face_body in re.findall(r"@font-face{(.*?)}", compact_css):
            try:
                font_name, font_link = get_font(face_body)
            except ValueError:
                # This rule offers no supported format; the stylesheet's
                # other rules may still do.
                continue
            # URLs inside a stylesheet are relative to that stylesheet.
            font_dict[font_name] = urljoin(style_URL, font_link)

    if not font_dict:
        raise ValueError("no downloadable fonts found")

    # Prompt user input
    prompt = " Select font numbers: (e.g. 0,1)\n" + "".join(
        f" [{idx}] {name}\n" for idx, name in enumerate(font_dict)
    )

    # comma-separated input. e.g. 1,2,3
    # or * (all available fonts)
    selected = input(prompt)
    # The whole answer is validated before the first download starts.
    for idx in _parse_selection(selected, len(font_dict)):
        download_font_at(idx)


def _parse_selection(selected, count):
    """Convert the prompt answer into a list of font indices below *count*."""
    answer = selected.strip()
    if answer == "*":
        return list(range(count))

    # Empty items (e.g. a trailing comma) are ignored rather than fatal.
    tokens = [token.strip() for token in answer.split(",")]
    tokens = [token for token in tokens if token]
    if not tokens or not all(token.isdecimal() for token in tokens):
        raise ValueError("response should be a number")

    indices = [int(token) for token in tokens]
    for idx in indices:
        if idx >= count:
            raise IndexError(f"font number out of range: {idx}")
    return indices


def get_font(name_link):
    """Pick the preferred downloadable font from one ``@font-face`` rule body.

    Args:
        name_link: the text between the braces of an ``@font-face`` rule.

    Returns:
        A ``(file_name, link)`` tuple, where ``link`` is the URL exactly as
        written in the rule (minus surrounding quotes) and ``file_name`` is
        its last path component. A ``.ttf`` source is preferred; otherwise
        the first ``.woff`` source is used.

    Raises:
        ValueError: the rule references neither a ``.ttf`` nor a ``.woff``.
    """
    # Each url(...) is matched on its own. Requiring a trailing format()
    # hint would make the pattern run across several declarations when an
    # earlier url() has no hint (the usual ".eot" fallback line).
    candidates = []
    for raw_link in re.findall(r"url\(([^)]*)\)", name_link):
        # CSS allows the URL to be single- or double-quoted.
        link = raw_link.strip().strip("'\"").strip()
        if link:
            candidates.append(link)

    # Extensions in order of preference. The comparison is
    # case-insensitive; links carrying a query string are not recognised.
    for extension in (".ttf", ".woff"):
        for link in candidates:
            if link.lower().endswith(extension):
                return os.path.basename(link), link

    raise ValueError("no downloadable fonts found")


def download_font_at(i):
    """Download the *i*-th font listed in the global ``font_dict``.

    A ``.ttf`` file is saved as is. A ``.woff`` file is saved temporarily,
    converted with ``convert_streams`` into an ``.otf`` file of the same
    base name, and then removed. Files are written to the current working
    directory. A failed HTTP request is reported on standard output and the
    font is skipped.

    Args:
        i: position of the font in ``font_dict`` (insertion order).

    Raises:
        IndexError: *i* is outside ``font_dict``.
        ValueError: the font URL ends in neither ``.woff`` nor ``.ttf``.
    """
    # Imported locally so the function carries its own URL helpers.
    from urllib.parse import urljoin, urlsplit

    font_name, font_link = list(font_dict.items())[i]
    # urljoin leaves absolute URLs alone and resolves protocol-relative
    # or relative ones against the site root.
    download_URL = urljoin(base_URL.rstrip("/") + "/", font_link)

    # Decide on the format before spending a request on it.
    url_path = urlsplit(download_URL).path.lower()
    is_woff = url_path.endswith(".woff")
    if not is_woff and not url_path.endswith(".ttf"):
        raise ValueError(f'unknown type: {download_URL.split(".")[-1]}')

    response = requests.get(download_URL)
    if response.status_code == 404:
        print(f"Failed: {font_name} - download page not found (404)")
        return
    if not response.ok:
        # Any other error body would otherwise be written out as a font.
        print(f"Failed: {font_name} - server answered HTTP {response.status_code}")
        return

    if not is_woff:
        with open(font_name, "wb") as ttf_file:
            ttf_file.write(response.content)
        print(f" Font saved: ./{font_name}")
        return

    # write font from the web
    woff_fname = font_name
    otf_fname = os.path.splitext(font_name)[0] + ".otf"
    with open(woff_fname, "wb") as woff_file:
        woff_file.write(response.content)

    # convert woff to otf; the temporary woff is removed even when the
    # conversion raises
    try:
        with open(woff_fname, "rb") as woff_fhand, open(otf_fname, "wb") as otf_fhand:
            convert_streams(woff_fhand, otf_fhand)
    finally:
        os.remove(woff_fname)
    print(f" Font saved: ./{otf_fname}")


def main(argv):
    """Run the extractor for every URL given on the command line.

    Args:
        argv: argument vector; ``argv[0]`` is the program name and every
            following item is a page URL.

    Raises:
        ValueError: no URL was supplied.
    """
    # Guard clause: nothing to do without at least one URL.
    if len(argv) < 2:
        raise ValueError("input URL does not exist")

    print_warning()

    urls = argv[1:]
    if len(urls) == 1:
        extract_webfont(urls[0])
        return

    # Several URLs: a separator line follows each one.
    for url in urls:
        extract_webfont(url)
        print("\n", "="*50, "\n")


# Script entry point: runs only when the file is executed directly, and
# passes the full argument vector (program name included) to main().
if __name__ == "__main__":
    main(sys.argv)