User:TJones (WMF)/Notes/TextCat Optimization for frwiki eswiki itwiki and dewiki/HQL
The Hive query for frwiki (adapted from relevanceForge/misc/fulltextQueriesSample.hql) is here:
USE wmf_raw;
SET year_min=2016;
SET year_max=2016;
SET month_min=2;
SET month_max=2;
SET day_min=5;
SET day_max=11;
SET max_res=3;
SET wiki='frwiki';
SET index='frwiki_content';
SELECT q FROM (
SELECT
-- keep only one query at random per ip/day
FIRST_VALUE(areq.query) OVER (
PARTITION BY csr.ip, csr.day
ORDER BY RAND()
) AS q,
csr.ip AS ip,
-- count the number of queries per day for one IP
COUNT(csr.ip) OVER (
PARTITION BY csr.ip, csr.day
) AS q_by_day
FROM
CirrusSearchRequestSet csr
-- Explode the requests array so we can extract the
-- last full_text query
LATERAL VIEW EXPLODE(requests) req AS areq
WHERE
year >= ${hiveconf:year_min} AND year <= ${hiveconf:year_max}
AND month >= ${hiveconf:month_min} AND month <= ${hiveconf:month_max}
AND day >= ${hiveconf:day_min} and day <= ${hiveconf:day_max}
-- When the user hit enter it generates a near_match query first.
AND csr.requests[0].queryType = 'near_match'
-- Filter the full_text query with the desired number of results
AND areq.queryType = 'full_text'
AND areq.hitstotal < ${hiveconf:max_res}
-- Make sure we extract only the right <wiki>_content
AND SIZE(areq.indices) == 1
AND areq.indices[0] = ${hiveconf:index}
AND wikiid=${hiveconf:wiki}
) queries
WHERE
queries.q_by_day < 30
DISTRIBUTE BY RAND()
SORT BY RAND()
LIMIT 10000;