PostgreSQL-关联规则的纯SQL实现
该SQL是基于Apriori算法的一个尝试(代码比较粗糙),并且只实现了一推一的情况,对照MadLib的结果做了验证,没有问题。
CREATE OR REPLACE FUNCTION assoc.rules()
RETURNS void
LANGUAGE plpgsql
AS $function$
begin
/*
create view assoc.trans as
select * from trans where purchase_date between '2015-08-01' and '2016-07-31';
*/
create TEMP table assoc_customer_brands ON commit drop as
select distinct on(customer_id, brand) customer_id, brand from assoc.trans;
create TEMP TABLE assoc_customer_brands_0 ON commit drop as
SELECT a.customer_id,a.brand _from, b.brand _to
from assoc_customer_brands a,assoc_customer_brands b
where a.customer_id=b.customer_id and a.brand<>b.brand;
create TEMP TABLE assoc_customer_brands_1 ON commit drop as
select _from, _to, count(*) as count
from assoc_customer_brands_0
GROUP BY 1,2;
CREATE TEMP TABLE assoc_customer_brands_2 ON commit drop as
SELECT brand, count(*) cnt
from assoc_customer_brands
GROUP BY 1;
DROP TABLE IF EXISTS assoc_customer_brands_3;
CREATE TEMP table assoc_customer_brands_3 ON commit drop as
SELECT count(DISTINCT customer_id) as total
from assoc_customer_brands;
-- EXP_CONF=rhs_count/total;
-- LIFT= CONF/EXP_CONF
-- CONF=COUNT/lhs_count
-- SUPPORT= COUNT/total
DROP TABLE IF EXISTS assoc.rules;
CREATE table assoc.rules as
SELECT
r.cnt::DOUBLE PRECISION/total as exp_conf,
a.count::DOUBLE PRECISION/l.cnt as conf ,
a.count::DOUBLE PRECISION/total as support,
(a.count::DOUBLE PRECISION*total)/(l.cnt*r.cnt) lift,
total ,
a.count,
a._from||'==>'||a._to as rule,
a._from _lhand,
a._to _rhand ,
a._from item1 ,
'====================>'::text item2 ,
a._to item3 ,
NULL::text item4
from assoc_customer_brands_1 a
,assoc_customer_brands_2 l
,assoc_customer_brands_2 r
,assoc_customer_brands_3 t
where a._from=l.brand
and a._to=r.brand
;
end;
$function$
关于transaction数据的准备,customer_id 可以是order_id(购物篮), 也可以基于customer_id, brand可以是品牌,产品,品类中的一种,或者品类和品牌的组合(如果需要的话,比如category||’~’||brand)。
结果,可以根据需要对confidence, support和lift进行筛选。
Todo,后续可能会依据PostgreSQL的Array来把所有可能的子集都查出来,然后生成完整的关联规则。