User:TJones (WMF)/Notes/TextCat Optimization for frwiki eswiki itwiki and dewiki/HQL

The Hive query for frwiki (adapted from relevanceForge/misc/fulltextQueriesSample.hql) is here:

USE wmf_raw;

SET year_min=2016;
SET year_max=2016;
SET month_min=2;
SET month_max=2;
SET day_min=5;
SET day_max=11;

SET max_res=3;
SET wiki='frwiki';
SET index='frwiki_content';

SELECT q FROM (
	SELECT
		-- keep only one query at random per ip/day
		FIRST_VALUE(areq.query) OVER (
			PARTITION BY csr.ip, csr.day
			ORDER BY RAND()
		) AS q,
		csr.ip AS ip,
		-- count the number of queries per day for one IP
		COUNT(csr.ip) OVER (
			PARTITION BY csr.ip, csr.day
		) AS q_by_day
	FROM
		CirrusSearchRequestSet csr
		-- Explode the requests array so we can extract the
		-- last full_text query
		LATERAL VIEW EXPLODE(requests) req AS areq
	WHERE
		year >= ${hiveconf:year_min} AND year <= ${hiveconf:year_max}
		AND month >= ${hiveconf:month_min} AND month <= ${hiveconf:month_max}
		AND day >= ${hiveconf:day_min} and day <= ${hiveconf:day_max}

		-- When the user hit enter it generates a near_match query first.
		AND csr.requests[0].queryType = 'near_match'

		-- Filter the full_text query with the desired number of results
		AND areq.queryType = 'full_text'
		AND areq.hitstotal < ${hiveconf:max_res}

		-- Make sure we extract only the right <wiki>_content
		AND SIZE(areq.indices) == 1
		AND areq.indices[0] = ${hiveconf:index}
		AND wikiid=${hiveconf:wiki}
) queries
WHERE
	queries.q_by_day < 30
DISTRIBUTE BY RAND()
SORT BY RAND()
LIMIT 10000;