R语言APRIORI实例

发表评论
14,421 阅读

A+

所属分类：R语言数据分析数据挖掘

Apriori算法是一种挖掘关联规则的频繁项集算法，其核心思想是通过候选集生成和向下封闭性检测（即频繁项集的所有子集也必须是频繁的）两个阶段来挖掘频繁项集。

下面给出一个探索性示例：

################################################################################
#########################APRIORI ALGORITHM#######################################
################################################################################
library(DBI)
library(ROracle)
library(arules)
# Open the Oracle connection used by the rest of the script.
# The user, password and connect string can be overridden through the
# ACCI_DB_USER, ACCI_DB_PASSWORD and ACCI_DB_DSN environment variables. When a
# variable is unset the original hard-coded value is used, so existing runs
# behave exactly as before.
# NOTE(review): credentials should not live in source; once the environment
# variables are configured, remove the fallback values below.
drv <- dbDriver("Oracle")
# Optional SQL predicate kept for reference (keeps rows whose gxsj is on or
# after the start of the current year):
##and gxsj>=trunc(sysdate,'yyyy')
# Alternative data source, disabled:
#conn=dbConnect(drv,'zjzhzx','zjzhzx_2016','10.118.1.1:1521/orcl')
conn <- dbConnect(
  drv,
  Sys.getenv("ACCI_DB_USER", unset = "acci_tz"),
  Sys.getenv("ACCI_DB_PASSWORD", unset = "acci_tz"),
  Sys.getenv("ACCI_DB_DSN", unset = "192.111.1.1:1521/jgyw")
)
# ---- Fetch the base data ----
# The query joins dw_acd_file to dw_acd_filehuman on sgbh, keeps rows with
# lh = 10104 whose nine analysed attributes are all non-null, and then joins
# each attribute code (dmz) to its dw_dim_* table to return the dmsm1 value
# in place of the code.
accidentSql <- "with temp as (
select t.tq,t.njd,t.dlaqsx,t.lmzk,t.lbqk,t.lmjg,d.nl,d.jl,t.sglx from dw_acd_file t, dw_acd_filehuman d
where t.sgbh=d.sgbh and lh=10104
and t.tq is not null
and t.njd is not null
and t.dlaqsx is not null
and t.lmzk is not null
and t.lbqk is not null
and t.lmjg is not null
and d.nl is not null
and d.jl is not null
and t.sglx is not null
)
select t1.dmsm1 tq,t2.dmsm1 njd,t3.dmsm1 dlaqsx
, t4.dmsm1 lmzk,t5.dmsm1 lbqk,t6.dmsm1 lmjg
,t7.dmsm1 nl,t8.dmsm1 jl,t9.dmsm1 sglx
from temp t
join dw_dim_tq t1 on t.tq=t1.dmz
join dw_dim_njd t2 on t.njd=t2.dmz
join dw_dim_dlaqsx t3 on t.dlaqsx=t3.dmz
join dw_dim_lmzk t4 on t.lmzk=t4.dmz
join dw_dim_lbqk t5 on t.lbqk=t5.dmz
join dw_dim_lmjg t6 on t.lmjg=t6.dmz
join dw_dim_nl t7 on t.nl=t7.dmz
join dw_dim_jl t8 on t.jl=t8.dmz
join dw_dim_sglx t9 on t.sglx=t9.dmz"
dataQuery <- dbSendQuery(conn, accidentSql)
# Fetch the result, then always release the result set (even when the fetch
# fails) so the open cursor is not leaked on the connection.
dataBasic <- tryCatch(
  fetch(dataQuery),
  finally = dbClearResult(dataQuery)
)
# ---- Convert every attribute to a factor ----
# The nine columns (addressed by their upper-case names) are each replaced by
# their factor version in a single pass.
factorCols <- c("TQ", "NJD", "DLAQSX", "LMZK", "LBQK", "LMJG", "NL", "JL", "SGLX")
dataBasic[factorCols] <- lapply(dataBasic[factorCols], as.factor)
# ---- Rename the columns so the output is easier to read ----
# Names are assigned by position, in the same order as the columns above.
colnames(dataBasic) <- c(
  "天气", "能见度", "道路安全属性", "路面状况", "路表情况", "路面结构",
  "年龄", "驾龄", "事故类型"
)
#dataBasic <- as.data.frame(as.factor(dataBasic$TQ))
# ---- Mine frequent itemsets ----
# Eclat with a minimum support of 0.05 and itemsets of at most 6 items.
frequentSets <- eclat(dataBasic, parameter = list(support = 0.05, maxlen = 6))
# Show the first itemsets (at most ten). head() is used instead of [1:10] so
# that a result with fewer than ten itemsets does not fail with an
# out-of-bounds subscript.
inspect(head(frequentSets, n = 10))
# Show the itemsets with the highest support (at most ten).
inspect(head(sort(frequentSets, by = "support"), n = 10))
# ---- Mine association rules ----
# Apriori thresholds: minimum support 0.01 and minimum confidence 0.01.
ruleParams <- list(support = 0.01, confidence = 0.01)
rules <- apriori(dataBasic, parameter = ruleParams)
# ---- Summary of the mined rules ----
summary(rules)
# ---- Filter the rules ----
# lhs is the left-hand side of an association rule and rhs its right-hand side;
# support, confidence and lift are the rule quality measures.
# Keep only the rules whose right-hand side is "事故类型=死亡事故".
x <- subset(rules, subset = rhs %in% "事故类型=死亡事故")
# Take the rules with the highest confidence (at most 20) and display them.
# head() replaces [1:20] so that fewer than 20 matching rules does not fail
# with an out-of-bounds subscript.
topRules <- head(sort(x, by = "confidence"), n = 20)
inspect(topRules)
inspect(x)
# ---- Extract the data and store it in the database ----
# Build the table from the rule object itself. inspect() is a display function
# and its return value is not a reliable data source across arules versions.
val <- data.frame(
  lhs = labels(lhs(topRules)),
  rhs = labels(rhs(topRules)),
  quality(topRules),
  stringsAsFactors = FALSE
)
# support/confidence stay numeric here; binding them into a matrix together
# with the item labels (as before) would coerce them to text.
RESULTV <- data.frame(
  ITEMX = val$lhs,
  ITEMY = val$rhs,
  SUPPORTX = val$support,
  CONFIDENCEY = val$confidence,
  stringsAsFactors = FALSE
)
if (nrow(RESULTV) > 0) {
  # Drop the previous result table only when it exists, so a first run
  # against a fresh schema does not fail.
  if (dbExistsTable(conn, "ACD_FACT_APRIOR")) {
    dbRemoveTable(conn, "ACD_FACT_APRIOR")
  }
  dbWriteTable(conn, "ACD_FACT_APRIOR", RESULTV, row.names = FALSE, append = TRUE)
} else {
  # Nothing to store: keep the previously written results instead of
  # replacing them with an empty table.
  warning("No rules matched; ACD_FACT_APRIOR was left unchanged.", call. = FALSE)
}
# NOTE(review): conn is never closed in the visible script; call
# dbDisconnect(conn) once no later step needs the connection.