class HQLLinearMakeFeature IUMParser HQLQuery INSERT OVERWRITE TABLE D

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
class HQLLinearMakeFeature(IUMParser):
HQLQuery = """
INSERT OVERWRITE TABLE {DST_TABLE} PARTITION (dt = '{DAYRECOM}')
SELECT tmp_features.ruid, NormFeaturesStrUDF(tmp_features.likelihood, '{SOURCE}',0, false), tmp_domain_features.normed_features
FROM (
SELECT ruid, likelihood
FROM perso_ruid_domain_likelihood
WHERE
dt = '{DAYRECOM}' AND
MakeHash(concat('ruid=', ruid, 'AHB')) <= (4294967295 * 1) AND
MakeHash(concat('ruid=', ruid, 'ABH')) < (4294967295 * 0.9)
) tmp_features
JOIN (
SELECT DISTINCT ruid
FROM factor_url_domain_day
WHERE
day >= '{DAYRECOM}' AND
MakeHash(concat('ruid=', ruid, 'AHB')) <= (4294967295 * 1) AND
MakeHash(concat('ruid=', ruid, 'ABH')) < (4294967295 * 0.9) AND
url LIKE '%rambler%'
) tmp_active_ruids
on tmp_features.ruid = tmp_active_ruids.ruid
JOIN (
SELECT ruid, NormFeaturesStrUDF(collect_set(concat(url, ':', days)), '{TOP_URLS_STR}', 0, true) AS normed_features
FROM factor_url_domain_month
WHERE
day = '{DAYLEARN}' AND
counts > 1 AND
array_contains(array({TOP_URLS}), url) AND
MakeHash(concat('ruid=', ruid, 'AHB')) <= (4294967295 * 1) AND
MakeHash(concat('ruid=', ruid, 'ABH')) < (4294967295 * 0.9)
GROUP BY ruid
) tmp_domain_features
on tmp_features.ruid = tmp_domain_features.ruid
"""
def get_teamplate(self):
header = HQLHeader([
'--add-jar', '/usr/local/hive/lib/hive-contrib-0.7.1-cdh3u6.jar',
'--add-jar', '/home/akhuraskin/devel/statlab/ad_hoc/linear/hive-functions.jar',
'--add-jar', '/home/memely/statlab/ad_hoc/linear/projects/linear/src/NormFeaturesStrUDF/hive-normfeaturesstrudf.jar',
'--add-function', "MakeHash as 'ru.rambler.hadoop.hive.MakeHash'",
'--add-function', "NormFeaturesStrUDF as 'ru.rambler.hadoop.hive.NormFeaturesStrUDF'",
'--num-reducer', '256',
'--job-name', 'Make feature for apply linear'
]).do()
return header + self.HQLQuery
def processArgs(self, args):
params = vars(args)
params['TOP_URLS_STR'] = params['TOP_URLS'].replace("'", '')
QUERY = self.get_teamplate()
QUERY = QUERY.format(**params)
if args.PRINT:
print QUERY
if args.EXEC:
executeHiveQL(QUERY)
def params(self):
dict_params = {}
addOption(dict_params, 'DST_TABLE', BANK['DST_TABLE'])
addOption(dict_params, 'EXEC', BANK['EXEC'])
addOption(dict_params, 'PRINT', BANK['PRINT'])
addOption(dict_params, 'DAYRECOM', {'parser': {'args': ['--day_recom'], 'kwargs': {'required': True}}})
addOption(dict_params, 'DAYLEARN', {'parser': {'args': ['--day_learn'], 'kwargs': {'required': True}}})
addOption(dict_params, 'TOP_URLS', {'parser': {'args': ['--top_urls'], 'kwargs': {'required': True}}})
addOption(dict_params, 'SOURCE', {'parser': {'args': ['--source'], 'kwargs': {'required': True}}})
return dict_params