UV:
count(distinct) hurts performance badly here: with a group by, Hive has to deduplicate all of a shop's user_ids inside a single reducer, which becomes a bottleneck on large tables. The improved version below deduplicates (user_id, shop) pairs with a GROUP BY subquery first and then simply counts rows per shop:
select tmp.shop, count(1) as uv
from (select user_id, shop from second_visit group by user_id, shop) tmp
group by shop;
PV:
select shop, count(user_id) as pv from second_visit group by shop;
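For comparison, the same per-shop PV/UV can be written against Spark's DataFrame API. This is only a minimal sketch, assuming a SparkSession with Hive support and that second_visit exposes the user_id and shop columns used in the SQL above:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("shop_pv_uv")
  .master("local")
  .enableHiveSupport()
  .getOrCreate()

// PV per shop: simply count the rows for each shop.
val pvPerShop = spark.table("second_visit").groupBy("shop").count()

// UV per shop: deduplicate (user_id, shop) pairs first (the inner GROUP BY above),
// then count what is left per shop (the outer count(1)).
val uvPerShop = spark.table("second_visit")
  .select("user_id", "shop")
  .distinct()
  .groupBy("shop")
  .count()

pvPerShop.show()
uvPerShop.show()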
2. MapReduce version
To be filled in later.
3. Spark version
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
conf.setAppName("pv_uv")
conf.setMaster("local")
val sc = new SparkContext(conf)

// Each line of the access log is one page view; the first space-separated field identifies the visitor.
val rdd1: RDD[String] = sc.textFile("F://bigdata/data/access.log")
val rdd2: RDD[String] = rdd1.map(_.split(" ")(0))

// PV: total number of log lines, computed two ways.
val pv1: Long = rdd2.count()
val pv2: RDD[(String, Int)] = rdd2.map(x => ("pv", 1)).reduceByKey(_ + _)
println(pv1)
println(pv2.collect().toBuffer)

// UV: number of distinct visitor identifiers (the empty parentheses on distinct() and count() can also be omitted).
val uv = rdd2.distinct().count()
println(uv)
Output:
74
ArrayBuffer((pv,74))
2
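In other words, the sample log has 74 lines in total (PV = 74, printed both as the plain count and as the reduceByKey result) and only 2 distinct visitor identifiers (UV = 2). Note that Spark's distinct() is itself a parallel, shuffle-based deduplication, so it does not hit the single-reducer bottleneck described above for Hive's count(distinct).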