<?php

/**
 * @file
 * Split a single name string into its name parts (first name, last name,
 * titles, middle names).
 */

namespace ADCI\FullNameParser;

use ADCI\FullNameParser\Exception\FirstNameNotFoundException;
use ADCI\FullNameParser\Exception\FlipStringException;
use ADCI\FullNameParser\Exception\IncorrectInputException;
use ADCI\FullNameParser\Exception\LastNameNotFoundException;
use ADCI\FullNameParser\Exception\ManyMiddleNamesException;
use ADCI\FullNameParser\Exception\MultipleMatchesException;
use ADCI\FullNameParser\Exception\NameParsingException;

/**
 * Class Parser.
 *
 * Parses a full-name string by repeatedly matching one name part with a
 * regular expression, storing it on a Name object and cutting the matched
 * text out of a working copy of the string ($name_token).
 *
 * @package FullNameParser
 */
class Parser
{

    // <editor-fold desc="Const section.">

    /*
     * The regex use is a bit tricky. *Everything* matched by the regex will be
     * replaced, but you can select a particular parenthesized submatch to be
     * returned. Also, note that each regex requires that the preceding ones
     * have been run, and matches chopped out.
     */

    /**
     * Parts with surrounding punctuation as nicknames.
     *
     * @var string
     */
    const REGEX_NICKNAMES = "/([\[('‘“\"]+)(.+?)(['’”\"\])]+)/";

    /**
     * Regex for titles.
     * Each title gets a "\.*" behind it.
     * It cannot be the last word in name.
     *
     * @var string
     */
    const REGEX_TITLES = "/((^| )(%s)\.* )/";

    /**
     * Regex for suffixes.
     * Before suffix must be space.
     * Each suffix gets a "\.*" behind it. Numeral suffixes do not get dots
     * behind them.
     * After a regular suffix there can be extra suffixes - comma separated
     * before each word, up to the end of string.
     * Otherwise there must be end of string, space or comma after the suffix.
     *
     * @var string
     */
    const REGEX_SUFFIX = "/( (((%s)\.*)|(%s))(((,+ +\S+)*$)|( |,)))/";

    /**
     * Regex for last name.
     *
     * @var string
     */
    const REGEX_LAST_NAME = "/(?!^)\b(([^ ]+ y|%s)\.? )*[^ ]+$/i";

    /**
     * Regex for initials.
     * Note the lookahead, which isn't returned or replaced.
     *
     * @var string
     */
    const REGEX_LEADING_INITIAL = "/^(.\.*)(?= \p{L}{2})/";

    /**
     * Regex for first name.
     *
     * @var string
     */
    const REGEX_FIRST_NAME = "/^[^ ]+/";

    /**
     * List of possible suffixes.
     *
     * @var array
     */
    const SUFFIXES = [
        'esq',
        'esquire',
        'jr',
        'sr',
        'phd',
    ];

    /**
     * List of numeral suffixes.
     *
     * @var array
     */
    const NUMERAL_SUFFIXES = [
        '2',
        'iii',
        'ii',
        'iv',
        'v',
    ];

    /**
     * List of possible prefixes.
     *
     * @var array
     */
    const PREFIXES = [
        'bar',
        'ben',
        'bin',
        'da',
        'dal',
        'de la',
        'de',
        'del',
        'der',
        'di',
        'ibn',
        'la',
        'le',
        'san',
        'st',
        'ste',
        'van der',
        'van den',
        'van',
        'vel',
        'von',
    ];

    /**
     * Words that keep exactly this spelling/casing when case fixing is on.
     *
     * @var array
     */
    const FORCED_CASE = [
        'e',
        'y',
        'av',
        'af',
        'da',
        'dal',
        'de',
        'del',
        'der',
        'di',
        'la',
        'le',
        'van',
        'der',
        'den',
        'vel',
        'von',
        'II',
        'III',
        'IV',
        'V',
        'J.D.',
        'LL.M.',
        'M.D.',
        'D.O.',
        'D.C.',
        'Ph.D.',
    ];

    /**
     * List of possible titles.
     *
     * @var array
     */
    const TITLES = ['ms', 'miss', 'mrs', 'mr', 'prof', 'dr'];

    /**
     * List of possible parts.
     *
     * @var array
     */
    const PARTS = [
        'title',
        'first',
        'middle',
        'last',
        'nick',
        'suffix',
        'error',
    ];

    /**
     * Return 'all' part by default.
     *
     * @var string
     */
    const PART = 'all';

    /**
     * Doesn't fix case by default.
     *
     * @var bool
     */
    const FIX_CASE = false;

    /**
     * Throw error by default.
     *
     * @var bool
     */
    const THROWS = true;

    // </editor-fold>

    // <editor-fold desc="Private vars section.">

    /**
     * Array of string possible suffixes.
     *
     * @var array
     */
    private $suffixes;

    /**
     * Array of string possible numeral suffixes.
     *
     * @var array
     */
    private $numeral_suffixes;

    /**
     * Array of string possible prefixes.
     *
     * @var array
     */
    private $prefixes;

    /**
     * Array of string possible titles.
     *
     * @var array
     */
    private $academic_titles;

    /**
     * Temporary variable of non-parsed name part.
     *
     * @var string
     */
    private $name_token;

    /**
     * Report an error if first name not found.
     *
     * @var bool
     */
    private $mandatory_first_name = true;

    /**
     * Report an error if last name not found.
     *
     * @var bool
     */
    private $mandatory_last_name = true;

    /**
     * Report a warning if there are many middle names.
     *
     * @var bool
     */
    private $mandatory_middle_name = true;

    /**
     * Object which contains parsed name parts.
     *
     * @var Name
     */
    private $name;

    /**
     * Name of part to return for.
     *
     * @var string
     */
    private $name_part;

    /**
     * Throw error if true.
     *
     * @var bool
     */
    private $stop_on_error;

    /**
     * Fix name case if true.
     *
     * @var bool
     */
    private $fix_case;

    // </editor-fold>

    /**
     * Parser constructor.
     *
     * Parameter $options is array of options with next keys possible:
     *  - 'suffixes' for an array of suffixes.
     *  - 'numeral_suffixes' for an array of numeral suffixes.
     *  - 'prefixes' for an array of prefixes.
     *  - 'academic_titles' for an array of titles.
     *  - 'mandatory_first_name' bool. Report error if first name not found.
     *  - 'mandatory_last_name' bool. Report error if last name not found.
     *  - 'mandatory_middle_name' bool. Report warning if many middle names.
     *  - 'part' string. Name part to return. Default 'all'.
     *  - 'fix_case' bool. Make name parts uppercase first letter. Default false.
     *  - 'throws' bool. Stop on errors. Default true.
     *
     * @param array $options
     *   Array of options. See method description for possible values.
     */
    public function __construct($options = [])
    {
        // Caller-supplied keys win; missing keys fall back to class defaults.
        $options += [
            'suffixes' => self::SUFFIXES,
            'numeral_suffixes' => self::NUMERAL_SUFFIXES,
            'prefixes' => self::PREFIXES,
            'academic_titles' => self::TITLES,
            'part' => self::PART,
            'fix_case' => self::FIX_CASE,
            'throws' => self::THROWS,
        ];

        // Any part name that is not listed in PARTS falls back to the default.
        if (!in_array(strtolower($options['part']), self::PARTS, true)) {
            $options['part'] = self::PART;
        }

        if (isset($options['mandatory_first_name'])) {
            $this->mandatory_first_name = (bool)$options['mandatory_first_name'];
        }
        if (isset($options['mandatory_last_name'])) {
            $this->mandatory_last_name = (bool)$options['mandatory_last_name'];
        }
        if (isset($options['mandatory_middle_name'])) {
            $this->mandatory_middle_name = (bool)$options['mandatory_middle_name'];
        }

        $this->setStopOnError($options['throws'] == true)
            ->setFixCase($options['fix_case'] == true)
            ->setNamePart(strtolower($options['part']))
            ->setSuffixes($options['suffixes'])
            ->setNumeralSuffixes($options['numeral_suffixes'])
            ->setPrefixes($options['prefixes'])
            ->setAcademicTitles($options['academic_titles']);
    }

    /**
     * Parse the name into its constituent parts.
     *
     * Non-string input is reported as an IncorrectInputException through
     * handleError(). The order of the find*() calls matters: every step works
     * on what the previous steps left in $name_token.
     *
     * @param string|mixed|null $name
     *   String to parse.
     *
     * @return Name|string $name
     *   Parsed name object or part of it.
     * @throws NameParsingException
     */
    public function parse($name)
    {
        $this->name = new Name();

        if (!is_string($name)) {
            $this->handleError(new IncorrectInputException());

            return $this->name->getPart($this->getNamePart());
        }

        $fullName = $this->normalize($name);
        if ($this->isFixCase()) {
            // Case is fixed word by word on the normalized string.
            $casedName = [];
            foreach (explode(' ', $fullName) as $word) {
                $casedName[] = $this->fixParsedNameCase($word);
            }
            $fullName = implode(' ', $casedName);
        }
        $this->name->setFullName($fullName);
        $this->name_token = $this->name->getFullName();

        // The configured lists become regex alternations.
        $suffixes = implode("|", $this->getSuffixes());
        $numeral_suffixes = implode("|", $this->getNumeralSuffixes());
        $prefixes = implode("|", $this->getPrefixes());
        $academicTitles = implode("|", $this->getAcademicTitles());

        $this->findAcademicTitle($academicTitles);
        $this->findNicknames();
        $this->findSuffix($numeral_suffixes, $suffixes);
        $this->flipNameToken();
        $this->findLastName($prefixes);
        $this->findLeadingInitial();
        $this->findFirstName();
        $this->findMiddleName();

        return $this->name->getPart($this->getNamePart());
    }

    /**
     * Record a parsing error and, if configured, stop on it.
     *
     * The error is always added to the Name object. When stop-on-error is
     * enabled, ManyMiddleNamesException is raised as an E_USER_WARNING and
     * every other exception is thrown.
     *
     * @param NameParsingException $ex
     *   Error to throw or add to error array.
     *
     * @return self
     * @throws NameParsingException
     */
    private function handleError(NameParsingException $ex)
    {
        $this->name->addError($ex);

        if ($this->isStopOnError()) {
            if ($ex instanceof ManyMiddleNamesException) {
                trigger_error($ex, E_USER_WARNING);
            } else {
                throw $ex;
            }
        }

        return $this;
    }

    /**
     * Fix the case of a single word when case fixing is enabled.
     *
     * Words listed in FORCED_CASE (compared case-insensitively) are replaced
     * by the listed spelling. Any other word is lower-cased and each
     * hyphen-separated part gets an upper-case first letter.
     *
     * @param string $word
     *   Word to fix.
     *
     * @return string
     *   The word with fixed case, or the unchanged word if case fixing is off.
     */
    private function fixParsedNameCase($word)
    {
        if (!$this->isFixCase()) {
            return $word;
        }

        $lowerWord = strtolower($word);
        foreach (self::FORCED_CASE as $item) {
            if ($lowerWord === strtolower($item)) {
                return $item;
            }
        }

        $hyphenated = explode('-', $word);
        foreach ($hyphenated as $id => $part) {
            $lowerPart = mb_strtolower($part);
            // ucfirst() works on bytes and leaves multibyte first letters
            // untouched, so the first character is upper-cased with mbstring.
            $hyphenated[$id] = mb_strtoupper(mb_substr($lowerPart, 0, 1)) . mb_substr($lowerPart, 1);
        }

        return implode('-', $hyphenated);
    }

    /**
     * Find and add academic title to Name object.
     *
     * @param string $academicTitles
     *   Regex alternation of titles to search for.
     *
     * @return self
     */
    private function findAcademicTitle($academicTitles)
    {
        $regex = sprintf(self::REGEX_TITLES, $academicTitles);
        $title = $this->findWithRegex($regex, 1);
        if ($title) {
            $this->name->setAcademicTitle($title);
            // Remove the title only where it stands as a separate token. A
            // plain substring replace would also eat the same letters inside
            // other words (e.g. "dr" in "Andrew"). The surrounding spaces are
            // left in place on purpose: later steps rely on them.
            $pattern = '/(^|[ ,])' . preg_quote($title, '/') . '(?=[ ,]|$)/ui';
            $stripped = preg_replace($pattern, '$1', $this->name_token);
            if ($stripped !== null) {
                $this->name_token = $stripped;
            }
        }

        return $this;
    }

    /**
     * Find and add nicknames to Name object.
     *
     * @return self
     * @throws NameParsingException
     */
    private function findNicknames()
    {
        $nicknames = $this->findWithRegex(self::REGEX_NICKNAMES, 2);
        if ($nicknames) {
            // Need to fix case because first char was bracket or quote.
            $this->name->setNicknames($this->fixParsedNameCase($nicknames));
            $this->removeTokenWithRegex(self::REGEX_NICKNAMES);
        }

        return $this;
    }

    /**
     * Find and add suffixes to Name object.
     *
     * @param string $numeral_suffixes
     *   The numeral suffixes to be searched for.
     * @param string $suffixes
     *   The suffixes to be searched for.
     *
     * @return self
     * @throws NameParsingException
     */
    private function findSuffix($numeral_suffixes, $suffixes)
    {
        $regex = sprintf(self::REGEX_SUFFIX, $suffixes, $numeral_suffixes);
        $suffix = $this->findWithRegex($regex, 1);
        if ($suffix) {
            // Remove found suffix. The delimiter must be quoted as well,
            // otherwise a "/" inside the suffix breaks the pattern.
            $regex_suffix = preg_quote($suffix, '/');
            $this->removeTokenWithRegex('/ (' . $regex_suffix . ')($| |,)/', '$2');
            $this->name->setSuffix($suffix);
        }

        return $this;
    }

    /**
     * Find and add last name to Name object.
     *
     * @param string $prefixes
     *   Regex alternation of last name prefixes.
     *
     * @return self
     * @throws NameParsingException
     */
    private function findLastName($prefixes)
    {
        $regex = sprintf(self::REGEX_LAST_NAME, $prefixes);
        $lastName = $this->findWithRegex($regex);
        if ($lastName) {
            $this->name->setLastName($lastName);
            $this->removeTokenWithRegex($regex);
        } elseif ($this->mandatory_last_name) {
            $this->handleError(new LastNameNotFoundException());
        }

        return $this;
    }

    /**
     * Find and add first name to Name object.
     *
     * @return self
     * @throws NameParsingException
     */
    private function findFirstName()
    {
        $firstName = $this->findWithRegex(self::REGEX_FIRST_NAME);
        if ($firstName) {
            $this->name->setFirstName($firstName);
            $this->removeTokenWithRegex(self::REGEX_FIRST_NAME);
        } elseif ($this->mandatory_first_name) {
            $this->handleError(new FirstNameNotFoundException());
        }

        return $this;
    }

    /**
     * Find and add leading initial to Name object.
     *
     * @return self
     * @throws NameParsingException
     */
    private function findLeadingInitial()
    {
        $leadingInitial = $this->findWithRegex(self::REGEX_LEADING_INITIAL, 1);
        if ($leadingInitial) {
            $this->name->setLeadingInitial($leadingInitial);
            $this->removeTokenWithRegex(self::REGEX_LEADING_INITIAL);
        }

        return $this;
    }

    /**
     * Find and add middle name to Name object.
     *
     * Whatever is left in the name token at this point is taken as the
     * middle name. More than two space-separated words are reported as
     * ManyMiddleNamesException when middle-name checking is enabled.
     *
     * @return self
     * @throws NameParsingException
     */
    private function findMiddleName()
    {
        $middleName = $this->name_token;
        $count = count(explode(' ', $middleName));
        if ($this->mandatory_middle_name && $count > 2) {
            $this->handleError(new ManyMiddleNamesException($count));
        }
        if ($middleName) {
            $this->name->setMiddleName($middleName);
        }

        return $this;
    }

    /**
     * Find and return part of name for regex.
     *
     * @param string $regex
     *   Regex to search.
     * @param int $submatchIndex
     *   Index of regex part.
     *
     * @return string|bool
     *   Found part of name, normalized. False if not found.
     */
    private function findWithRegex($regex, $submatchIndex = 0)
    {
        // unicode + case-insensitive
        $regex = $regex . "ui";
        preg_match($regex, $this->name_token, $match);
        if (!isset($match[$submatchIndex])) {
            return false;
        }

        // No need commas and spaces in name parts.
        return $this->normalize($match[$submatchIndex]);
    }

    /**
     * Remove found part from name string.
     *
     * More than one replacement is reported as MultipleMatchesException.
     * The remaining token is normalized afterwards.
     *
     * @param string $regex
     *   Regex to remove name part.
     * @param string $replacement
     *   String to replace.
     *
     * @return self
     * @throws NameParsingException
     */
    private function removeTokenWithRegex($regex, $replacement = ' ')
    {
        $numReplacements = 0;
        $tokenRemoved = preg_replace($regex . 'ui', $replacement, $this->name_token, -1, $numReplacements);
        if ($numReplacements > 1) {
            $this->handleError(new MultipleMatchesException());
        }

        $this->name_token = $this->normalize($tokenRemoved);

        return $this;
    }

    /**
     * Removes extra whitespace and punctuation from string.
     *
     * Strips whitespace chars from ends, strips redundant whitespace, converts
     * whitespace chars to " ".
     *
     * @param string $taintedString
     *   String to normalize.
     *
     * @return string
     *   Normalized string.
     */
    private function normalize($taintedString)
    {
        // Remove any kind of invisible character from the start.
        $taintedString = preg_replace("#^\s*#u", "", $taintedString);
        // Remove any kind of invisible character from the end.
        $taintedString = preg_replace("#\s*$#u", "", $taintedString);
        // Add exception so that non-breaking space characters are not
        // stripped during norm function.
        if (substr_count($taintedString, "\xc2\xa0") == 0) {
            // Replace any kind of invisible character in string to whitespace.
            $taintedString = preg_replace("#\s+#u", " ", $taintedString);
        }
        // Replace two commas to one.
        $taintedString = preg_replace("(, ?, ?)", ", ", $taintedString);
        // Remove commas and spaces from the ends of the string.
        $taintedString = trim($taintedString, " ,");

        return $taintedString;
    }

    /**
     * Flip name around comma.
     *
     * @return self
     * @throws NameParsingException
     */
    private function flipNameToken()
    {
        $this->name_token = $this->flipStringPartsAround($this->name_token, ",");

        return $this;
    }

    /**
     * Flips the front and back parts of a name with one another.
     *
     * Front and back are determined by a specified character somewhere in the
     * middle of the string. With exactly one occurrence the two parts are
     * swapped; with more than one a FlipStringException is reported and the
     * string is returned unchanged.
     *
     * @param string $string
     *   String to flip.
     * @param string $char
     *   Char to flip around for.
     *
     * @return string
     *   Flipped string.
     * @throws NameParsingException
     */
    private function flipStringPartsAround($string, $char)
    {
        // Quote the separator so that it is always matched literally.
        $substrings = preg_split('/' . preg_quote($char, '/') . '/u', $string);
        if ($substrings === false) {
            // Splitting failed; nothing to flip.
            return $string;
        }

        if (count($substrings) == 2) {
            $string = $substrings[1] . " " . $substrings[0];
            $string = $this->normalize($string);
        } elseif (count($substrings) > 2) {
            $this->handleError(new FlipStringException($char, $this->name->getFullName()));
        }

        return $string;
    }

    // <editor-fold desc="Getter/Setter section.">

    /**
     * Suffixes getter.
     *
     * @return array
     */
    public function getSuffixes()
    {
        return $this->suffixes;
    }

    /**
     * Suffixes setter.
     *
     * @param array $suffixes
     *   The suffixes to set.
     *
     * @return self
     */
    public function setSuffixes($suffixes)
    {
        $this->suffixes = $suffixes;

        return $this;
    }

    /**
     * Numeral suffixes getter.
     *
     * @return array
     */
    public function getNumeralSuffixes()
    {
        return $this->numeral_suffixes;
    }

    /**
     * Numeral suffixes setter.
     *
     * @param array $numeral_suffixes
     *   The numeral suffixes to set.
     *
     * @return self
     */
    public function setNumeralSuffixes($numeral_suffixes)
    {
        $this->numeral_suffixes = $numeral_suffixes;

        return $this;
    }

    /**
     * Prefixes getter.
     *
     * @return array
     */
    public function getPrefixes()
    {
        return $this->prefixes;
    }

    /**
     * Prefixes setter.
     *
     * @param array $prefixes
     *   The prefixes.
     *
     * @return self
     */
    public function setPrefixes($prefixes)
    {
        $this->prefixes = $prefixes;

        return $this;
    }

    /**
     * Titles getter.
     *
     * @return array
     */
    public function getAcademicTitles()
    {
        return $this->academic_titles;
    }

    /**
     * Titles setter.
     *
     * @param array $academicTitles
     *   The academic titles.
     *
     * @return self
     */
    public function setAcademicTitles($academicTitles)
    {
        $this->academic_titles = $academicTitles;

        return $this;
    }

    /**
     * Name part getter.
     *
     * @return string
     */
    public function getNamePart()
    {
        return $this->name_part;
    }

    /**
     * Name part setter.
     *
     * @param string $namePart
     *   Name of part of name to return.
     *
     * @return self
     */
    public function setNamePart($namePart)
    {
        $this->name_part = $namePart;

        return $this;
    }

    /**
     * Stop on error getter.
     *
     * @return bool
     */
    public function isStopOnError()
    {
        return $this->stop_on_error;
    }

    /**
     * Stop on error setter.
     *
     * @param bool $stopOnError
     *   Stop when get parse error.
     *
     * @return self
     */
    public function setStopOnError($stopOnError)
    {
        $this->stop_on_error = $stopOnError;

        return $this;
    }

    /**
     * Fix case getter.
     *
     * @return bool
     */
    public function isFixCase()
    {
        return $this->fix_case;
    }

    /**
     * Fix case setter.
     *
     * @param bool $fixCase
     *   Fix case when parse.
     *
     * @return self
     */
    public function setFixCase($fixCase)
    {
        $this->fix_case = $fixCase;

        return $this;
    }

    // </editor-fold>
}