HEX
Server: Apache/2.4.41 (Ubuntu)
System: Linux wordpress-ubuntu-s-2vcpu-4gb-fra1-01 5.4.0-169-generic #187-Ubuntu SMP Thu Nov 23 14:52:28 UTC 2023 x86_64
User: root (0)
PHP: 7.4.33
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare,
Upload Files
File: /var/www/hcv/wp-content/plugins/wordpress-seo-premium/src/actions/link-suggestions-action.php
<?php

namespace Yoast\WP\SEO\Premium\Actions;

use WP_Query;
use WPSEO_Premium_Prominent_Words_Support;
use Yoast\WP\SEO\Models\Indexable;
use Yoast\WP\SEO\Premium\Helpers\Prominent_Words_Helper;
use Yoast\WP\SEO\Premium\Repositories\Prominent_Words_Repository;
use Yoast\WP\SEO\Repositories\Indexable_Repository;
use Yoast\WP\SEO\Repositories\SEO_Links_Repository;

/**
 * Handles the actual requests to the prominent words endpoints.
 */
class Link_Suggestions_Action {

	/**
	 * The amount of indexables to retrieve in one go
	 * when generating internal linking suggestions.
	 */
	const BATCH_SIZE = 1000;

	/**
	 * The repository to retrieve prominent words from.
	 *
	 * @var Prominent_Words_Repository
	 */
	protected $prominent_words_repository;

	/**
	 * The repository to retrieve indexables from.
	 *
	 * @var Indexable_Repository
	 */
	protected $indexable_repository;

	/**
	 * The repository to retrieve links from.
	 *
	 * @var SEO_Links_Repository
	 */
	protected $links_repository;

	/**
	 * Contains helper functions for calculating with and comparing prominent words.
	 *
	 * @var Prominent_Words_Helper
	 */
	protected $prominent_words_helper;

	/**
	 * Represents the prominent words support class.
	 *
	 * @var WPSEO_Premium_Prominent_Words_Support
	 */
	protected $prominent_words_support;

	/**
	 * Link_Suggestions_Service constructor.
	 *
	 * @param Prominent_Words_Repository            $prominent_words_repository The repository to retrieve prominent words from.
	 * @param Indexable_Repository                  $indexable_repository       The repository to retrieve indexables from.
	 * @param Prominent_Words_Helper                $prominent_words_helper     Class with helper methods for prominent words.
	 * @param WPSEO_Premium_Prominent_Words_Support $prominent_words_support    The prominent words support class.
	 * @param SEO_Links_Repository                  $links_repository           The repository to retrieve links from.
	 */
	public function __construct(
		Prominent_Words_Repository $prominent_words_repository,
		Indexable_Repository $indexable_repository,
		Prominent_Words_Helper $prominent_words_helper,
		WPSEO_Premium_Prominent_Words_Support $prominent_words_support,
		SEO_Links_Repository $links_repository
	) {
		$this->prominent_words_repository = $prominent_words_repository;
		$this->indexable_repository       = $indexable_repository;
		$this->prominent_words_helper     = $prominent_words_helper;
		$this->prominent_words_support    = $prominent_words_support;
		$this->links_repository           = $links_repository;
	}

	/**
	 * Suggests a list of links, based on the given array of prominent words.
	 *
	 * @param array  $words_from_request     The prominent words as an array mapping words to weights.
	 * @param int    $limit                  The maximum number of link suggestions to retrieve.
	 * @param int    $object_id              The object id for the current indexable.
	 * @param string $object_type            The object type for the current indexable.
	 * @param bool   $include_existing_links Optional. Whether or not to include existing links, defaults to true.
	 * @param array  $post_type              Optional. The list of post types where suggestions may come from.
	 * @param bool   $only_include_public    Optional. Only include public indexables, defaults to false.
	 *
	 * @return array Links for the post that are suggested.
	 */
	public function get_suggestions( $words_from_request, $limit, $object_id, $object_type, $include_existing_links = true, $post_type = [], $only_include_public = false ) {
		$current_indexable_id = null;
		$current_indexable    = $this->indexable_repository->find_by_id_and_type( $object_id, $object_type );
		if ( $current_indexable ) {
			$current_indexable_id = $current_indexable->id;
		}

		/*
		 * Gets best suggestions (returns a sorted array [$indexable_id => score]).
		 * The indexables are processed in batches of 1000 indexables each.
		 */
		$suggestions_scores = $this->retrieve_suggested_indexable_ids( $words_from_request, $limit, self::BATCH_SIZE, $current_indexable_id, $include_existing_links, $post_type, $only_include_public );

		$indexable_ids = \array_keys( $suggestions_scores );

		// Return the empty list if no suggestions have been found.
		if ( empty( $indexable_ids ) ) {
			return [];
		}

		// Retrieve indexables for suggestions.
		$suggestions_indexables = $this->indexable_repository->query()->where_id_in( $indexable_ids )->find_many();

		/**
		 * Filter 'wpseo_link_suggestions_indexables' - Allow filtering link suggestions indexable objects.
		 *
		 * @api array An array of suggestion indexables that can be filtered.
		 *
		 * @param int    $object_id   The object id for the current indexable.
		 * @param string $object_type The object type for the current indexable.
		 */
		$suggestions_indexables = \apply_filters( 'wpseo_link_suggestions_indexables', $suggestions_indexables, $object_id, $object_type );

		// Create suggestions objects.
		return $this->create_suggestions( $suggestions_indexables, $suggestions_scores );
	}

	/**
	 * Suggests a list of links, based on the given array of prominent words.
	 *
	 * @param int  $id                     The object id for the current indexable.
	 * @param int  $limit                  The maximum number of link suggestions to retrieve.
	 * @param bool $include_existing_links Optional. Whether or not to include existing links, defaults to true.
	 *
	 * @return array Links for the post that are suggested.
	 */
	public function get_indexable_suggestions_for_indexable( $id, $limit, $include_existing_links = true ) {
		$weighted_words  = [];
		$prominent_words = $this->prominent_words_repository->query()
			->where( 'indexable_id', $id )
			->find_array();
		foreach ( $prominent_words as $prominent_word ) {
			$weighted_words[ $prominent_word['stem'] ] = $prominent_word['weight'];
		}

		/*
		 * Gets best suggestions (returns a sorted array [$indexable_id => score]).
		 * The indexables are processed in batches of 1000 indexables each.
		 */
		$suggestions_scores = $this->retrieve_suggested_indexable_ids( $weighted_words, $limit, self::BATCH_SIZE, $id, $include_existing_links );

		$indexable_ids = \array_keys( $suggestions_scores );

		// Return the empty list if no suggestions have been found.
		if ( empty( $indexable_ids ) ) {
			return [];
		}

		// Retrieve indexables for suggestions.
		return $this->indexable_repository->query()->where_id_in( $indexable_ids )->find_array();
	}

	/**
	 * Retrieves the titles of the posts with the given IDs.
	 *
	 * @param array $post_ids The IDs of the posts to retrieve the titles of.
	 *
	 * @return array An array mapping post ID to title.
	 */
	protected function retrieve_posts( $post_ids ) {
		$query = new WP_Query(
			[
				'post_type'      => $this->prominent_words_support->get_supported_post_types(),
				'post__in'       => $post_ids,
				'posts_per_page' => \count( $post_ids ),
			]
		);
		$posts = $query->get_posts();

		$post_data = [];

		foreach ( $posts as $post ) {
			$post_data[ $post->ID ] = [
				'title' => $post->post_title,
			];
		}

		return $post_data;
	}

	/**
	 * Retrieves the names of the terms with the given IDs.
	 *
	 * @param Indexable[] $indexables The indexables to retrieve titles for.
	 *
	 * @return array An array mapping term ID to title.
	 */
	protected function retrieve_terms( $indexables ) {
		$data = [];
		foreach ( $indexables as $indexable ) {
			if ( $indexable->object_type !== 'term' ) {
				continue;
			}

			$term = \get_term_by( 'term_id', $indexable->object_id, $indexable->object_sub_type );

			$data[ $indexable->object_id ] = [
				'title' => $term->name,
			];
		}

		return $data;
	}

	/**
	 * Retrieves the titles of the given array of indexables.
	 *
	 * @param Indexable[] $indexables An array of indexables for which to retrieve the titles.
	 *
	 * @return array A two-dimensional array mapping object type and object id to title.
	 */
	protected function retrieve_object_titles( $indexables ) {
		$object_ids = [];

		foreach ( $indexables as $indexable ) {
			if ( \array_key_exists( $indexable->object_type, $object_ids ) ) {
				$object_ids[ $indexable->object_type ][] = $indexable->object_id;
			}
			else {
				$object_ids[ $indexable->object_type ] = [ $indexable->object_id ];
			}
		}

		$objects = [
			'post' => [],
			'term' => [],
		];

		// At the moment we only support internal linking for posts, so we only need the post titles.
		if ( \array_key_exists( 'post', $object_ids ) ) {
			$objects['post'] = $this->retrieve_posts( $object_ids['post'] );
		}

		if ( \array_key_exists( 'term', $object_ids ) ) {
			$objects['term'] = $this->retrieve_terms( $indexables );
		}

		return $objects;
	}

	/**
	 * Computes, for a given indexable, its raw matching score on the request words to match.
	 * In general, higher scores mean better matches.
	 *
	 * @param array $request_data   The words to match, as an array containing stems, weights and dfs.
	 * @param array $candidate_data The words to match against, as an array of `Prominent_Words` objects.
	 *
	 * @return float A raw score of the indexable.
	 */
	protected function compute_raw_score( $request_data, $candidate_data ) {
		$raw_score = 0;

		foreach ( $candidate_data as $stem => $candidate_word_data ) {
			if ( ! \array_key_exists( $stem, $request_data ) ) {
				continue;
			}

			$word_from_request_weight = $request_data[ $stem ]['weight'];
			$word_from_request_df     = $request_data[ $stem ]['df'];
			$candidate_weight         = $candidate_word_data['weight'];
			$candidate_df             = $candidate_word_data['df'];

			$tf_idf_word_from_request  = $this->prominent_words_helper->compute_tf_idf_score( $word_from_request_weight, $word_from_request_df );
			$tf_idf_word_from_database = $this->prominent_words_helper->compute_tf_idf_score( $candidate_weight, $candidate_df );

			// Score on this word is the product of the tf-idf scores.
			$raw_score += ( $tf_idf_word_from_request * $tf_idf_word_from_database );
		}

		return $raw_score;
	}

	/**
	 * Combines weight data of the request words to their document frequencies. This is needed to calculate
	 * vector length of the request data.
	 *
	 * @param array $request_words An array mapping words to weights.
	 *
	 * @return array An array mapping stems, weights and dfs.
	 */
	protected function compose_request_data( $request_words ) {
		$request_doc_frequencies = $this->prominent_words_repository->count_document_frequencies( \array_keys( $request_words ) );
		$combined_request_data   = [];
		foreach ( $request_words as $stem => $weight ) {
			if ( ! isset( $request_doc_frequencies[ $stem ] ) ) {
				continue;
			}

			$combined_request_data[ $stem ] = [
				'weight' => (int) $weight,
				'df'     => $request_doc_frequencies[ $stem ],
			];
		}

		return $combined_request_data;
	}

	/**
	 * Transforms the array of prominent words into an array of objects mapping indexable_id to an array
	 * of prominent words associated with this indexable_id, with each prominent word's stem as a key.
	 *
	 * @param array $words The array of prominent words, with indexable_id as one of the keys.
	 *
	 * @return array An array mapping indexable IDs to their prominent words.
	 */
	protected function group_words_by_indexable_id( $words ) {
		$candidates_words_by_indexable_ids = [];
		foreach ( $words as $word ) {
			$indexable_id = $word->indexable_id;

			$candidates_words_by_indexable_ids[ $indexable_id ][ $word->stem ] = [
				'weight' => (int) $word->weight,
				'df'     => (int) $word->df,
			];
		}

		return $candidates_words_by_indexable_ids;
	}

	/**
	 * Calculates a matching score for one candidate indexable.
	 *
	 * @param array $request_data          An array matching stems from request to their weights and dfs.
	 * @param float $request_vector_length The vector length of the request words.
	 * @param array $candidate_data        An array matching stems from the candidate to their weights and dfs.
	 *
	 * @return float A matching score for an indexable.
	 */
	protected function calculate_score_for_indexable( $request_data, $request_vector_length, $candidate_data ) {
		$raw_score               = $this->compute_raw_score( $request_data, $candidate_data );
		$candidate_vector_length = $this->prominent_words_helper->compute_vector_length( $candidate_data );
		return $this->normalize_score( $raw_score, $candidate_vector_length, $request_vector_length );
	}

	/**
	 * In the prominent words repository, find a $batch_size of all ProminentWord-IndexableID pairs where
	 * prominent words match the set of stems we are interested in.
	 * Request prominent words for indexables in the batch (including the iDF of all words) to calculate
	 * their vector length later.
	 *
	 * @param array $stems               The stems in the request.
	 * @param int   $batch_size          How many indexables to request in one query.
	 * @param int   $page                The start of the current batch (in pages).
	 * @param int[] $excluded_ids        The indexable IDs to exclude.
	 * @param array $post_type           The post types that will be searched.
	 * @param bool  $only_include_public If only public indexables are included.
	 *
	 * @return array An array of ProminentWords objects, containing their stem, weight, indexable id,
	 *               and document frequency.
	 */
	protected function get_candidate_words( $stems, $batch_size, $page, $excluded_ids = [], $post_type = [], $only_include_public = false ) {

		return $this->prominent_words_repository->find_by_list_of_ids(
			$this->prominent_words_repository->find_ids_by_stems( $stems, $batch_size, $page, $excluded_ids, $post_type, $only_include_public )
		);
	}

	/**
	 * For each candidate indexable, computes their matching score related to the request set of prominent words.
	 * The candidate indexables are analyzed in batches.
	 * After having computed scores for a batch the function saves the best candidates until now.
	 *
	 * @param array    $request_words          The words to match, as an array mapping words to weights.
	 * @param int      $limit                  The max number of suggestions that should be returned by the function.
	 * @param int      $batch_size             The number of indexables that should be analyzed in every batch.
	 * @param int|null $current_indexable_id   The id for the current indexable.
	 * @param bool     $include_existing_links Optional. Whether or not to include existing links, defaults to true.
	 * @param array    $post_type              Optional. The list of post types where suggestions may come from.
	 * @param bool     $only_include_public    Optional. Only include public indexables, defaults to false.
	 *
	 * @return array An array mapping indexable IDs to scores. Higher scores mean better matches.
	 */
	protected function retrieve_suggested_indexable_ids( $request_words, $limit, $batch_size, $current_indexable_id, $include_existing_links = true, $post_type = [], $only_include_public = false ) {
		// Combine stems, weights and DFs from request.
		$request_data = $this->compose_request_data( $request_words );

		// Calculate vector length of the request set (needed for score normalization later).
		$request_vector_length = $this->prominent_words_helper->compute_vector_length( $request_data );

		// Get all links the post already links to, those shouldn't be suggested.
		$excluded_indexable_ids = [ $current_indexable_id ];
		if ( ! $include_existing_links && $current_indexable_id ) {
			$links                  = $this->links_repository->query()
				->distinct()
				->select( 'indexable_id' )
				->where( 'target_indexable_id', $current_indexable_id )
				->find_many();
			$excluded_indexable_ids = \array_merge( $excluded_indexable_ids, \wp_list_pluck( $links, 'indexable_id' ) );
		}
		$excluded_indexable_ids = \array_filter( $excluded_indexable_ids );

		$request_stems = \array_keys( $request_data );
		$scores        = [];
		$page          = 1;

		do {
			// Retrieve the words of all indexables in this batch that share prominent word stems with request.
			$candidates_words = $this->get_candidate_words( $request_stems, $batch_size, $page, $excluded_indexable_ids, $post_type, $only_include_public );

			// Transform the prominent words table so that it is indexed by indexable_ids.
			$candidates_words_by_indexable_ids = $this->group_words_by_indexable_id( $candidates_words );

			$batch_scores_size = 0;

			foreach ( $candidates_words_by_indexable_ids as $id => $candidate_data ) {
				$scores[ $id ] = $this->calculate_score_for_indexable( $request_data, $request_vector_length, $candidate_data );
				++$batch_scores_size;
			}

			// Sort the list of scores and keep only the top $limit of the scores.
			$scores = $this->get_top_suggestions( $scores, $limit );

			++$page;
		} while ( $batch_scores_size === $batch_size );

		return $scores;
	}

	/**
	 * Normalizes the raw score based on the length of the prominent word vectors.
	 *
	 * @param float $raw_score               The raw (non-normalized) score.
	 * @param float $vector_length_candidate The vector lengths of the candidate indexable.
	 * @param float $vector_length_request   The vector length of the words from the request.
	 *
	 * @return int|float The score, normalized on vector lengths.
	 */
	protected function normalize_score( $raw_score, $vector_length_candidate, $vector_length_request ) {
		$normalizing_factor = ( $vector_length_request * $vector_length_candidate );

		if ( $normalizing_factor === 0.0 ) {
			// We can't divide by 0, so set the score to 0 instead.
			return 0;
		}

		return ( $raw_score / $normalizing_factor );
	}

	/**
	 * Sorts the indexable ids based on the score and returns the top N indexable ids based on a specified limit.
	 * (Returns all indexable ids if there are less indexable ids than specified by the limit.)
	 *
	 * @param array $scores The array matching indexable ids to their scores.
	 * @param int   $limit  The maximum number of indexables that should be returned.
	 *
	 * @return array The top N indexable ids, sorted from highest to lowest score.
	 */
	protected function get_top_suggestions( $scores, $limit ) {
		// Sort the indexables by descending score.
		\uasort(
			$scores,
			static function ( $score_1, $score_2 ) {
				if ( $score_1 === $score_2 ) {
					return 0;
				}

				return ( ( $score_1 < $score_2 ) ? 1 : -1 );
			}
		);

		// Take the top $limit suggestions, while preserving their ids specified in the keys of the array elements.
		return \array_slice( $scores, 0, $limit, true );
	}

	/**
	 * Gets the singular label of the given combination of object type and sub type.
	 *
	 * @param string $object_type     An object type. For example 'post' or 'term'.
	 * @param string $object_sub_type An object sub type. For example 'page' or 'category'.
	 *
	 * @return string The singular label of the given combination of object type and sub type,
	 *                or the empty string if the singular label does not exist.
	 */
	protected function get_sub_type_singular_label( $object_type, $object_sub_type ) {
		switch ( $object_type ) {
			case 'post':
				$post_type = \get_post_type_object( $object_sub_type );
				if ( $post_type ) {
					return $post_type->labels->singular_name;
				}
				break;
			case 'term':
				$taxonomy = \get_taxonomy( $object_sub_type );
				if ( $taxonomy ) {
					return $taxonomy->labels->singular_name;
				}
				break;
		}

		return '';
	}

	/**
	 * Creates link suggestion data based on the indexables that should be suggested and the scores for these
	 * indexables.
	 *
	 * @param Indexable[] $indexables The indexables for which to create linking suggestions.
	 * @param array       $scores     The scores for the linking suggestions.
	 *
	 * @return array The internal linking suggestions.
	 */
	protected function create_suggestions( $indexables, $scores ) {
		$objects          = $this->retrieve_object_titles( $indexables );
		$link_suggestions = [];

		foreach ( $indexables as $indexable ) {
			if ( ! \array_key_exists( $indexable->object_type, $objects ) ) {
				continue;
			}

			// Object tied to this indexable. E.g. post, page, term.
			if ( ! \array_key_exists( $indexable->object_id, $objects[ $indexable->object_type ] ) ) {
				continue;
			}

			$link_suggestions[] = [
				'object_type'   => $indexable->object_type,
				'id'            => (int) ( $indexable->object_id ),
				'title'         => $objects[ $indexable->object_type ][ $indexable->object_id ]['title'],
				'link'          => $indexable->permalink,
				'isCornerstone' => (bool) $indexable->is_cornerstone,
				'labels'        => $this->get_labels( $indexable ),
				'score'         => \round( (float) ( $scores[ $indexable->id ] ), 2 ),
			];
		}

		/*
		 * Because the request to the indexables table messes up with the ordering of the suggestions,
		 * we have to sort again.
		 */
		$this->sort_suggestions_by_field( $link_suggestions, 'score' );

		$cornerstone_suggestions     = $this->filter_suggestions( $link_suggestions, true );
		$non_cornerstone_suggestions = $this->filter_suggestions( $link_suggestions, false );

		return \array_merge_recursive( [], $cornerstone_suggestions, $non_cornerstone_suggestions );
	}

	/**
	 * Retrieves the labels for the link suggestion.
	 *
	 * @param Indexable $indexable The indexable to determine the labels for.
	 *
	 * @return array The labels.
	 */
	protected function get_labels( Indexable $indexable ) {
		$labels = [];
		if ( $indexable->is_cornerstone ) {
			$labels[] = 'cornerstone';
		}

		$labels[] = $this->get_sub_type_singular_label( $indexable->object_type, $indexable->object_sub_type );

		return $labels;
	}

	/**
	 * Sorts the given link suggestion by field.
	 *
	 * @param array  $link_suggestions The link suggestions to sort.
	 * @param string $field            The field to sort suggestions by.
	 */
	protected function sort_suggestions_by_field( array &$link_suggestions, $field ) {
		\usort(
			$link_suggestions,
			static function ( $suggestion_1, $suggestion_2 ) use ( $field ) {
				if ( $suggestion_1[ $field ] === $suggestion_2[ $field ] ) {
					return 0;
				}

				return ( ( $suggestion_1[ $field ] < $suggestion_2[ $field ] ) ? 1 : -1 );
			}
		);
	}

	/**
	 * Filters the suggestions by cornerstone status.
	 *
	 * @param array $link_suggestions The suggestions to filter.
	 * @param bool  $cornerstone      Whether or not to include the cornerstone suggestions.
	 *
	 * @return array The filtered suggestions.
	 */
	protected function filter_suggestions( $link_suggestions, $cornerstone ) {
		return \array_filter(
			$link_suggestions,
			static function ( $suggestion ) use ( $cornerstone ) {
				return (bool) $suggestion['isCornerstone'] === $cornerstone;
			}
		);
	}
}