"""Summary.
COPYRIGHT:
This code is based on 'face_recognition' written by Adam Geitgey (ageitgey),
and modified by Yoshitsugu Kesamaru (yKesamaru).
ORIGINAL AUTHOR:
- Dlib: davisking
- face_recognition: ageitgey
- FACE01, and api.py: yKesamaru
References:
- Dlib:
- https://github.com/davisking/dlib
- Dlib Python API:
- http://dlib.net/python/index.html
- dlib/python_example/face_recognition.py:
- https://github.com/davisking/dlib/blob/master/python_examples/face_recognition.py
- Dlib Face Recognition Model:
- https://github.com/davisking/dlib-models
- Face Recognition:
- https://github.com/ageitgey/face_recognition
- Max-Margin Object Detection(MMOD):
- [Ja] https://blog.chowagiken.co.jp/entry/2019/06/28/OpenCV%E3%81%A8dlib%E3%81%AE%E9%A1%94%E6%A4%9C%E5%87%BA%E6%A9%9F%E8%83%BD%E3%81%AE%E6%AF%94%E8%BC%83
- https://github.com/davisking/dlib-models
- Typing (numpy.typing):
- https://numpy.org/doc/stable/reference/typing.html#typing-numpy-typing
- EfficientNetV2:
- https://arxiv.org/pdf/2104.00298.pdf
- https://github.com/huggingface/pytorch-image-models
- ArcFace:
- https://arxiv.org/pdf/1801.07698.pdf
- MobileFaceNets:
- https://arxiv.org/ftp/arxiv/papers/1804/1804.07573.pdf
NOTE:
About coordinate order...
- dlib: (Left, Top, Right, Bottom), called 'rect'.
- face_recognition: (top, right, bottom, left), called 'css'.
See below:
https://github.com/davisking/dlib/blob/master/python_examples/face_recognition.py
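A minimal sketch of converting between the two orders (values are illustrative):
.. code-block:: python

    import dlib
    rect = dlib.rectangle(10, 20, 110, 120)                       # (left, top, right, bottom) = 'rect'
    css = (rect.top(), rect.right(), rect.bottom(), rect.left())  # (20, 110, 120, 10) = 'css'
    rect_again = dlib.rectangle(css[3], css[0], css[1], css[2])   # back to a dlib 'rect'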
DEBUG: MEMORY LEAK
.. code-block:: python

    from .memory_leak import Memory_leak
    m = Memory_leak(limit=2, key_type='traceback', nframe=20)
    m.memory_leak_analyze_start()
See below:
[Ja] https://zenn.dev/ykesamaru/articles/bd403aa6d03100
"""
from typing import List, Tuple
import dlib
import numpy as np
import numpy.typing as npt # See [Typing (numpy.typing)](https://numpy.org/doc/stable/reference/typing.html#typing-numpy-typing)
from PIL import ImageFile
from PIL import Image
import cv2
ImageFile.LOAD_TRUNCATED_IMAGES = True
from sys import exit
from traceback import format_exc
from face01lib.Calc import Cal
from face01lib.logger import Logger
# from face01lib.video_capture import VidCap
import onnx
import onnxruntime as ort
import torchvision.transforms as transforms
class Dlib_api:
"""Dlib api.
Author: Original code written by Adam Geitgey, modified by YOSHITSUGU KESAMARU
Email: y.kesamaru@tokai-kaoninsho.com
"""
def __init__(
self,
# log_level: str = 'info'
log_level: str = 'error'
) -> None:
"""init.
Args:
log_level (str, optional): Receive log level value. Defaults to 'error'.
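Example:
A minimal instantiation sketch (the import path is assumed):
.. code-block:: python

    from face01lib.api import Dlib_api
    api = Dlib_api(log_level='error')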
"""
# Setup logger: common way
self.log_level: str = log_level
import os.path
name: str = __name__
dir: str = os.path.dirname(__file__)
parent_dir, _ = os.path.split(dir)
# self.logger = Logger(self.log_level).logger(name, parent_dir)
# self.logger.info("COPYRIGHT: TOKAI-KAONINSHO, yKesamaru")
# self.logger.info("FACE01: 商用利用にはライセンスが必要です")
# DEBUG: ログ抑制 #################################################
# Loggerクラスの初期化
self.logger = Logger('error').logger(name, parent_dir) # ログレベルを'error'に設定
# ONNX Runtimeのログ抑制 (例)
os.environ['ORT_LOGGING_LEVEL'] = 'ERROR'
# #################################################################
# cv2のスパムログ出力を抑制 ####################
import logging
logging.getLogger('cv2').setLevel(logging.ERROR)
os.environ['OPENCV_LOG_LEVEL'] = 'ERROR'
# ##############################################
try:
from .models import Models
Models_obj = Models()
except Exception:
self.logger.error("Failed to import dlib model")
self.logger.error("-" * 20)
self.logger.error(format_exc(limit=None, chain=True))
self.logger.error("-" * 20)
exit(0)
# Cal().cal_specify_date(self.logger)  # Date-based deactivation handling
self.face_detector = dlib.get_frontal_face_detector() # type: ignore
self.predictor_5_point_model = Models_obj.pose_predictor_five_point_model_location()
self.pose_predictor_5_point = dlib.shape_predictor(
self.predictor_5_point_model) # type: ignore
self.cnn_face_detection_model = Models_obj.cnn_face_detector_model_location()
self.cnn_face_detector = dlib.cnn_face_detection_model_v1(
self.cnn_face_detection_model) # type: ignore
self.dlib_resnet_model = Models_obj.dlib_resnet_model_location()
self.dlib_resnet_face_encoder = dlib.face_recognition_model_v1(
self.dlib_resnet_model) # type: ignore
self.JAPANESE_FACE_V1 = Models_obj.JAPANESE_FACE_V1_model_location()
self.JAPANESE_FACE_V1_model = onnx.load(self.JAPANESE_FACE_V1)
# self.ort_session = ort.InferenceSession(self.JAPANESE_FACE_V1)
self.ort_session = ort.InferenceSession(self.JAPANESE_FACE_V1, providers=[
'CUDAExecutionProvider', 'CPUExecutionProvider'])
# Print the model signature
for prop in self.JAPANESE_FACE_V1_model.metadata_props:
if prop.key == "signature":
print(prop.value)
def JAPANESE_FACE_V1_model_compute_face_descriptor(
self,
resized_frame: npt.NDArray[np.uint8],
raw_face_landmark,  # dlib.full_object_detection (5-point landmarks).
size: int = 224,
_PADDING: float = 0.1
) -> npt.NDArray[np.float32]:
"""JAPANESE FACE V1モデルを使用して顔の特徴量を計算します。
この関数は、与えられた顔の画像データから、JAPANESE FACE V1モデルを使用して顔の特徴量(embedding)を計算します。
Args:
resized_frame (npt.NDArray[np.uint8]): リサイズされたフレームの画像データ。
raw_face_landmark (dlib.rectangle): 顔のランドマーク情報。
size (int, optional): 顔のチップのサイズ。デフォルトは224。
_PADDING (float, optional): 顔のチップを取得する際のパディング。デフォルトは0.1。
Returns:
npt.NDArray[np.float32]: 顔の特徴量(embedding)。
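Example:
A minimal sketch, assuming ``api`` is a Dlib_api instance and ``frame`` is an image with one detected face:
.. code-block:: python

    locations = api.face_locations(frame, mode="hog")
    landmark = api._return_raw_face_landmarks(frame, locations)[0]
    emb = api.JAPANESE_FACE_V1_model_compute_face_descriptor(frame, landmark, size=224, _PADDING=0.1)
    # emb is a 512-dimensional embedding (with a leading batch dimension)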
"""
self.resized_frame: npt.NDArray[np.uint8] = resized_frame
# VidCap().frame_imshow_for_debug(self.resized_frame)
self.raw_face_landmark = raw_face_landmark  # dlib.full_object_detection (5-point landmarks).
# print(self.raw_face_landmark)
self.size: int = size
self._PADDING: float = _PADDING
face_image_np: npt.NDArray = dlib.get_face_chip(
self.resized_frame, self.raw_face_landmark, size=self.size, padding=self._PADDING) # type: ignore
# Convert face_image from BGR to RGB
face_image_rgb = cv2.cvtColor(
face_image_np, cv2.COLOR_BGR2RGB) # type: ignore
# VidCap().frame_imshow_for_debug(face_image_rgb)
# Get the model's input name
input_name: str = self.JAPANESE_FACE_V1_model.graph.input[0].name
# Define the image preprocessing
mean_value = [0.485, 0.456, 0.406]
std_value = [0.229, 0.224, 0.225]
transform = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(
mean=mean_value,
std=std_value
)
])
# Convert the numpy array to a PIL Image
face_image_pil: Image.Image = Image.fromarray(face_image_rgb)
image = transform(face_image_pil)
image = image.unsqueeze(0)  # Add a batch dimension  # type: ignore
image = image.numpy()
embedding: npt.NDArray[np.float32] = self.ort_session.run(
None, {input_name: image})[0]  # Use input_name instead of a hard-coded 'input'
return embedding
def _rect_to_css(self, rect: dlib.rectangle) -> Tuple[int, int, int, int]:
"""Convert a dlib 'rect' object to a plain tuple in (top, right, bottom, left) order.
This method is used only when 'use_pipe = False'.
Args:
rect (dlib.rectangle): dlib rect object
Returns:
Tuple[int, int, int, int]: Plain tuple representation of the rect in (top, right, bottom, left) order
"""
self.rect: dlib.rectangle = rect # type: ignore
return self.rect.top(), self.rect.right(), self.rect.bottom(), self.rect.left()
def _css_to_rect(self, css: Tuple[int, int, int, int]) -> dlib.rectangle:
"""Convert a tuple in (top, right, bottom, left) order to a dlib 'rect' object.
Args:
css (Tuple[int, int, int, int]): Plain tuple representation of the rect in (top, right, bottom, left) order
Returns:
dlib.rectangle: <class '_dlib_pybind11.rectangle'>
"""
self.css: Tuple[int, int, int, int] = css
return dlib.rectangle(self.css[3], self.css[0], self.css[1], self.css[2]) # type: ignore
def _trim_css_to_bounds(
self,
css: Tuple[int, int, int, int],
image_shape: Tuple[int, int, int]
) -> Tuple[int, int, int, int]:
"""Trim 'css' along the image border.
Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.
This method is used only when 'use_pipe = False'.
Args:
css (Tuple[int, int, int, int]): Plain tuple representation of the rect in (top, right, bottom, left) order
image_shape (Tuple[int, int, int]): numpy shape of the image array
Returns:
Tuple[int, int, int, int]: A trimmed plain tuple representation of the rect in (top, right, bottom, left) order
"""
self._trim_css_to_bounds_css: Tuple[int, int, int, int] = css
self.image_shape: Tuple[int, int, int] = image_shape
return (
max(self._trim_css_to_bounds_css[0], 0),
min(self._trim_css_to_bounds_css[1], self.image_shape[1]),
min(self._trim_css_to_bounds_css[2], self.image_shape[0]),
max(self._trim_css_to_bounds_css[3], 0)
)
def load_image_file(
self,
file: str,
mode: str = 'RGB'
) -> npt.NDArray[np.uint8]:
"""Loads an image file (.jpg, .png, etc) into a numpy array.
Args:
file (str): image file name or file object to load
mode (str): format to convert the image to. Only 'RGB' (8-bit RGB, 3 channels) and 'L' (black and white) are supported.
Returns:
npt.NDArray[np.uint8]: image contents as numpy array
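Example:
A minimal sketch (the file name is hypothetical):
.. code-block:: python

    api = Dlib_api()
    img = api.load_image_file("some_image.png")  # -> numpy array in RGB order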
"""
self.file = file
self.mode = mode
im = Image.open(self.file)
if self.mode:
im = im.convert(self.mode)
return np.array(im)
def _raw_face_locations(
self,
resized_frame: npt.NDArray[np.uint8],
number_of_times_to_upsample: int = 0,
mode: str = "cnn"
) -> List[dlib.rectangle]: # type: ignore
"""Returns an array of bounding boxes of human faces in a image.
This method used only 'use_pipe = False'.
Args:
npt.NDArray[np.uint8]: resized_frame: An image
int: number_of_times_to_upsample
- How many times to upsample the image looking for faces. Higher numbers find smaller faces.
str: mode
- Which face detection mode to use. "hog" is less accurate but faster on CPUs. "cnn" is a more accurate deep-learning mode which is GPU/CUDA accelerated (if available). The default is "hog".
Returns:
List[dlib.rectangle]: a list of dlib 'rect' objects of found face locations
"""
self.resized_frame: npt.NDArray[np.uint8] = resized_frame
self.number_of_times_to_upsample: int = number_of_times_to_upsample
self.mode: str = mode
if self.mode == "cnn":
return self.cnn_face_detector(self.resized_frame, self.number_of_times_to_upsample)
else:
return self.face_detector(self.resized_frame, self.number_of_times_to_upsample)
def face_locations(
self,
resized_frame: npt.NDArray[np.uint8],
number_of_times_to_upsample: int = 0,
mode: str = "hog"
) -> List[Tuple[int, int, int, int]]:
"""Returns an array of bounding boxes of human faces in a image.
This method used only 'use_pipe = False'.
Args:
resized_frame (npt.NDArray[np.uint8]): Resized image
number_of_times_to_upsample (int): How many times to upsample the image looking for faces. Higher numbers find smaller faces.
mode (str): Which face detection mode to use. "hog" is less accurate but faster on CPUs. "cnn" is a more accurate deep-learning mode which is GPU/CUDA accelerated (if available). The default is "hog".
Returns:
A list of tuples of found face locations in css (top, right, bottom, left) order
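Example:
A minimal sketch (the file name is hypothetical):
.. code-block:: python

    api = Dlib_api()
    frame = api.load_image_file("some_image.png")
    locations = api.face_locations(frame, number_of_times_to_upsample=0, mode="hog")
    # -> [(top, right, bottom, left), ...]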
"""
self.resized_frame: npt.NDArray[np.uint8] = resized_frame
self.number_of_times_to_upsample: int = number_of_times_to_upsample
self.mode: str = mode
face_locations: List[Tuple[int, int, int, int]] = []
if self.mode == 'cnn':
for face in self._raw_face_locations(
self.resized_frame,
self.number_of_times_to_upsample,
self.mode
):
face_locations.append(
self._trim_css_to_bounds(
self._rect_to_css(face.rect),
self.resized_frame.shape
)
)
else:
for face in self._raw_face_locations(
self.resized_frame,
self.number_of_times_to_upsample,
self.mode
):
face_locations.append(
self._trim_css_to_bounds(
self._rect_to_css(face),
self.resized_frame.shape
)
)
return face_locations
def _return_raw_face_landmarks(
self,
resized_frame: npt.NDArray[np.uint8],
face_location_list: List[Tuple[int, int, int, int]],
model: str = "small"
) -> List[dlib.full_object_detection]:  # type: ignore
new_face_location_list: List[dlib.rectangle] = []  # type: ignore
raw_face_location: Tuple[int, int, int, int]
for raw_face_location in face_location_list:
new_face_location_list.append(self._css_to_rect(raw_face_location))
raw_face_landmarks: List[dlib.full_object_detection] = []  # type: ignore
new_face_location: dlib.rectangle  # type: ignore
for new_face_location in new_face_location_list:
raw_face_landmarks.append(
self.pose_predictor_5_point(resized_frame, new_face_location)
)
return raw_face_landmarks
def face_encodings(
self,
deep_learning_model: int,
resized_frame: npt.NDArray[np.uint8],
# Initial value of 'face_location_list' is '[]'.
face_location_list: List = [],
num_jitters: int = 0,
model: str = "small"
) -> List[np.ndarray]:
"""Given an image, return the 128-dimension face encoding for each face in the image.
Args:
resized_frame (npt.NDArray[np.uint8]): The image that contains one or more faces (=small_frame)
face_location_list (List): Optional - the bounding boxes of each face if you already know them. (=face_location_list)
num_jitters (int): How many times to re-sample the face when calculating encoding. Higher is more accurate, but slower (i.e. 100 is 100x slower)
model (str): Do not modify.
Returns:
List[npt.NDArray[np.float64]]: A list of 128-dimensional face encodings (one for each face in the image).
If deep_learning_model == 1, the returned list contains 512-dimensional face encodings, with the type
List[npt.NDArray[np.float32]].
Image size: The image should be of size 150x150. Also, cropping must be done as dlib.get_face_chip would do it.
That is, centered and scaled essentially the same way.
See also:
class dlib.face_recognition_model_v1: compute_face_descriptor(*args, **kwargs):
http://dlib.net/python/index.html#dlib_pybind11.face_recognition_model_v1
compute_face_descriptor(*args, **kwargs):
http://dlib.net/python/index.html#dlib_pybind11.face_recognition_model_v1.compute_face_descriptor
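Example:
A minimal usage sketch (the file name is hypothetical):
.. code-block:: python

    api = Dlib_api()
    frame = api.load_image_file("some_image.png")
    locations = api.face_locations(frame, mode="hog")
    # deep_learning_model=0 -> 128-D dlib encodings, 1 -> 512-D JAPANESE_FACE_V1 embeddings
    encodings = api.face_encodings(0, frame, locations, num_jitters=0)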
"""
self.deep_learning_model: int = deep_learning_model
self.face_encodings_resized_frame: npt.NDArray[np.uint8] = resized_frame
self.face_location_list: List = face_location_list
self.num_jitters: int = num_jitters
self.face_encodings_model: str = model
_PADDING: float = 0.25  # Padding for the dlib trained model
face_encodings: List[npt.NDArray[np.float64]] = []
if len(self.face_location_list) > 0:
raw_face_landmarks: List = self._return_raw_face_landmarks(
self.face_encodings_resized_frame,
self.face_location_list,
self.face_encodings_model
)
raw_face_landmark: dlib.full_object_detection # type: ignore
face_landmark_ndarray: npt.NDArray[np.float64] = np.array([])
if self.deep_learning_model == 0:
for raw_face_landmark in raw_face_landmarks:
# face_landmark_ndarray: npt.NDArray[np.float64] = []
face_landmark_ndarray: npt.NDArray[np.float64] = np.array(
# Make 128 dimensional vector
self.dlib_resnet_face_encoder.compute_face_descriptor(
self.face_encodings_resized_frame,
raw_face_landmark,
self.num_jitters,
_PADDING
)
)
face_encodings.append(face_landmark_ndarray)
elif self.deep_learning_model == 1:
for raw_face_landmark in raw_face_landmarks:
face_landmark_ndarray: npt.NDArray[np.float64] = np.array(
self.JAPANESE_FACE_V1_model_compute_face_descriptor(
self.face_encodings_resized_frame,
raw_face_landmark,
size=224,
_PADDING=0.1
)
)
face_encodings.append(face_landmark_ndarray)  # Fix: append each face's encoding to the list
return face_encodings
"""
[compute_face_descriptor](https://blog.dlib.net/2017/02/high-quality-face-recognition-with-deep.html?m=0&commentPage=2)
Davis King said...
The landmarks are only used to align the face before the DNN extracts
the face descriptor. How many landmarks you use doesn't really matter.
"""
# TODO: #27 Padding around faces, 0.25
# return [np.array(self.dlib_resnet_face_encoder.compute_face_descriptor(self.face_encodings_resized_frame, raw_landmark_set, self.num_jitters, 0.25)) for raw_landmark_set in raw_landmarks]
# 4th value (0.25) is padding around the face. If padding == 0 then the chip will
# be closely cropped around the face. Setting larger padding values will result a looser cropping.
# In particular, a padding of 0.5 would double the width of the cropped area, a value of 1.0
# would triple it, and so forth.
# See below:
# http://dlib.net/face_recognition.py.html
"""マルチスレッド化
pool = ThreadPoolExecutor()
# pool = ProcessPoolExecutor(max_workers=1) # Error while calling cudaGetDevice(&the_device_id) in file /tmp/pip-install-983gqknr/dlib_66282e4ffadf4aa6965801c6f7ff7671/dlib/cuda/gpu_data.cpp:204. code: 3, reason: initialization error
return [pool.submit(multithread, raw_landmark_set, self.face_encodings_resized_frame, self.num_jitters).result() for raw_landmark_set in raw_landmarks]
"""
def face_distance(
self,
face_encodings: List[npt.NDArray[np.float64]],
face_to_compare: npt.NDArray[np.float64]
# face_encodings: List[np.ndarray],
# face_to_compare: np.ndarray
) -> npt.NDArray[np.float64]:
"""与えられた顔エンコーディングのリストを既知の顔エンコーディングと比較し、各比較顔のユークリッド距離を返します.
距離が近ければ、顔がどれだけ似ているかが分かります。
Args:
face_encodings (List[npt.NDArray[np.float64]]): List of face encodings to compare (=small_frame)
face_to_compare (npt.NDArray[np.float64]): A face encoding to compare against (=face_location_list)
Returns:
npt.NDArray[np.float64]: 顔(名前)配列と同じ順序の、顔同士の距離である numpy ndarray を返します
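Example:
A small sketch of the underlying computation:
.. code-block:: python

    import numpy as np
    known = [np.zeros(128), np.ones(128)]
    probe = np.zeros(128)
    np.linalg.norm(np.array(known) - probe, axis=1)  # -> array([ 0., 11.3137...])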
"""
# self.face_encodings = face_encodings
# self.face_to_compare = face_to_compare
if len(face_encodings) == 0:
# return dummy data
return np.empty((2, 2, 3), dtype=np.float64)
# ord = None -> Frobenius norm. norm for vectors is '2-norm'.
# See:
# document: https://numpy.org/doc/stable/reference/generated/numpy.linalg.norm.html
# [The square root of the sum of squares of all components is called the Frobenius norm.](https://manabitimes.jp/math/1284)
# > The Frobenius norm can also be viewed as the length (2-norm) of the vector obtained by flattening all components of the matrix into a single vector.
return np.linalg.norm(x=(face_encodings - face_to_compare), axis=1)
def cosine_similarity(self, embedding1, embedding2, threshold=0.4):
"""
cosine_similarity: Takes feature vectors, computes their cosine similarity, and returns whether each exceeds the threshold.
Args:
embedding1 (npt.NDArray): feature vector
embedding2 (npt.NDArray): feature vector
threshold (float, optional): threshold. Defaults to 0.4.
Returns:
Tuple[List[Tuple[bool, float]], float]: A list of (bool, cosine similarity) tuples and the maximum cosine similarity
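Example:
A small sketch of the computation on toy vectors:
.. code-block:: python

    import numpy as np
    embedding1 = np.array([[0.6, 0.8]])   # one known embedding
    embedding2 = np.array([1.0, 0.0])     # embedding to check
    cos_sim = np.dot(embedding1[0], embedding2) / (
        np.linalg.norm(embedding1[0]) * np.linalg.norm(embedding2))  # 0.6 -> (True, 0.6) with threshold 0.4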
"""
results: List[Tuple[bool, float]] = []
max_cos_sim = float(0.0)
embedding2 = embedding2.flatten()
for emb1 in embedding1:
emb1 = emb1.flatten()
cos_sim: float = np.dot(
emb1, embedding2) / (np.linalg.norm(emb1) * np.linalg.norm(embedding2))
if cos_sim >= threshold:
results.append((True, cos_sim))
else:
results.append((False, cos_sim))
max_cos_sim = max(max_cos_sim, cos_sim)
return results, max_cos_sim
# for emb1 in embedding1:
# emb1 = emb1.flatten()
# cos_sim = np.dot(emb1, embedding2) / (np.linalg.norm(emb1) * np.linalg.norm(embedding2))
# results.append(cos_sim >= threshold)
# max_cos_sim= max(max_cos_sim, cos_sim)
# return np.array(results), max_cos_sim
# Percentage calculation
def percentage(self, cos_sim):
"""
percentage: Compute the similarity percentage from the given cos_sim.
Args:
cos_sim (float): cosine similarity
Returns:
float: percentage of similarity
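Example:
For example, cos_sim = 0.4 gives round(-23.71 * 0.4**2 + 49.98 * 0.4 + 73.69, 2) = 89.89.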
"""
# BUG: Issue #25
return round(-23.71 * cos_sim ** 2 + 49.98 * cos_sim + 73.69, 2)
def compare_faces(
self,
deep_learning_model: int,
known_face_encodings: List[npt.NDArray[np.float64]],
face_encoding_to_check: npt.NDArray[np.float64],
tolerance: float = 0.6,
threshold: float = 0.4
) -> Tuple[np.ndarray, float]:
"""顔エンコーディングのリストを候補エンコーディングと比較して、それら数値の比較をします。
Args:
deep_learning_model (int): 0: dlib cnn model, 1: JAPANESE_FACE_V1.onnx
known_face_encodings (List[npt.NDArray[np.float64]]): known face encodingsのリスト
face_encoding_to_check (npt.NDArray[np.float64]): リストに対して比較する、単一の顔エンコーディング
tolerance (float): 顔間の距離がどのくらいあれば一致するとみなされるか。dlibの場合、低いほど厳密で、0.6 が一般的な値です。
threshold (float): 閾値
Returns:
どの known_face_encoding がチェック対象の顔エンコーディングに一致するか、およびそれらの間の最小距離を示す True/False 値のタプル。
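Example:
A minimal sketch with dlib encodings (deep_learning_model=0), assuming the encodings already exist:
.. code-block:: python

    matches, min_distance = api.compare_faces(
        0, known_face_encodings, face_encoding_to_check, tolerance=0.6)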
"""
self.deep_learning_model: int = deep_learning_model
self.known_face_encodings: List[npt.NDArray[np.float64]] = known_face_encodings
self.face_encoding_to_check: npt.NDArray[np.float64] = face_encoding_to_check
self.tolerance: float = tolerance
self.threshold: float = threshold
# dlib model:
if self.deep_learning_model == 0:
face_distance_list: List[float] = list(
self.face_distance(
self.known_face_encodings,
self.face_encoding_to_check
)
)
self.min_distance: float = min(face_distance_list)
if self.min_distance > self.tolerance:
# All elements are 'False' if 'self.min_distance' is greater than 'self.tolerance'.
# return [False] * len(face_distance_list)
return np.full(len(face_distance_list), False), self.min_distance
else:
bool_list: List[Tuple[bool, float]] = []
for face_distance in face_distance_list:
if self.tolerance >= face_distance:  # When face_distance is less than or equal to tolerance.
bool_list.append((True, face_distance))
elif self.tolerance < face_distance:
bool_list.append((False, face_distance))
else:
exit(1)
return np.array(bool_list), self.min_distance
# bool_list: List[bool] = []
# if self.tolerance >= face_distance:
# bool_list.append(True)
# else:
# bool_list.append(False)
# return np.array(bool_list), self.min_distance
# JAPANESE_FACE_V1 model:
elif self.deep_learning_model == 1:
results: List[Tuple[bool, float]] = []
results, max_cos_sim = \
self.cosine_similarity(
self.known_face_encodings, self.face_encoding_to_check, self.threshold)
return np.array(results), max_cos_sim