Data description: each record has the form name|url|access time. Requirement: compute how long each user stayed on each page, then write the result to MySQL. Result columns: user, url, dwell time.


tom|www.baidu.com?page=1|2019-10-28 13:40:20
jim|www.sohu.com?page=1|2019-10-28 13:40:25
susan|www.zhihu.com?page=1|2019-10-28 13:40:28
tom|www.baidu.com?page=2|2019-10-28 13:40:32
tom|www.baidu.com?page=3|2019-10-28 13:40:36
susan|www.zhihu.com?page=2|2019-10-28 13:40:40
jim|www.sohu.com?page=2|2019-10-28 13:40:50
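
Working the sample by hand: tom's consecutive hits are 12 s apart (13:40:20 → 13:40:32) and then 4 s apart (13:40:32 → 13:40:36), jim's two hits are 25 s apart, and susan's are 12 s apart. Each user's last hit has no following record, so its dwell time cannot be derived from this log; the per-row query below leaves it null. The intermediate result should therefore look like:

name   url                    period
tom    www.baidu.com?page=1   12
tom    www.baidu.com?page=2   4
tom    www.baidu.com?page=3   null
jim    www.sohu.com?page=1    25
jim    www.sohu.com?page=2    null
susan  www.zhihu.com?page=1   12
susan  www.zhihu.com?page=2   null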

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object Need4 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[1]").setAppName(this.getClass.getSimpleName)
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Each line looks like: tom|www.baidu.com?page=1|2019-10-28 13:40:20
    val sourceFile = sc.textFile("E:\\五道口大数据\\吉祥spark开始\\20200204\\sparkSQL第一天需求\\需求4\\pageaccess.txt")
    val df = sourceFile.map(line => {
      val split = line.split("[|]") // "|" is a regex metacharacter, so wrap it in a character class
      (split(0), split(1), split(2))
    }).toDF("name", "url", "time")
    df.createTempView("user")

    // lead(time) fetches the next access time within each user's partition,
    // so period = next access time - current access time, in seconds.
    // The last hit of each user has no successor: lead() returns null there,
    // and so does period.
    sqlContext.sql(
      "select name, url, time, " +
        "lead(time) over (partition by name order by time) endtime, " +
        "UNIX_TIMESTAMP(lead(time) over (partition by name order by time), 'yyyy-MM-dd HH:mm:ss') " +
        "- UNIX_TIMESTAMP(time, 'yyyy-MM-dd HH:mm:ss') period " +
        "from user"
    ).show()
    //.coalesce(1).write.csv("D:\\djz\\20200204\\a")

    // Aggregated result the exercise asks for: total dwell time per user and
    // url, with the trailing null period counted as 0:
    //   select name, url, nvl(sum(period), 0) total_time
    //   from (
    //     select name, url,
    //            UNIX_TIMESTAMP(lead(time) over (partition by name order by time), 'yyyy-MM-dd HH:mm:ss')
    //            - UNIX_TIMESTAMP(time, 'yyyy-MM-dd HH:mm:ss') period
    //     from user
    //   ) a
    //   group by name, url

    sc.stop()
  }
}
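
The requirement also says the result must be written to MySQL, which the code above never does. Below is a minimal sketch using Spark's DataFrame JDBC writer: it runs the commented aggregate query and writes one row per (user, url). The database name page_stats, the table name user_page_time, and the root/root credentials are placeholders, not from the original post.

import java.util.Properties

// Inside main, after df.createTempView("user"):
val result = sqlContext.sql(
  "select name, url, nvl(sum(period), 0) total_time from (" +
    "select name, url, " +
    "UNIX_TIMESTAMP(lead(time) over (partition by name order by time), 'yyyy-MM-dd HH:mm:ss') " +
    "- UNIX_TIMESTAMP(time, 'yyyy-MM-dd HH:mm:ss') period " +
    "from user) a " +
    "group by name, url")

// Placeholder connection details; adjust to the real MySQL instance.
val props = new Properties()
props.setProperty("user", "root")
props.setProperty("password", "root")
props.setProperty("driver", "com.mysql.jdbc.Driver") // Connector/J 5.x driver class

// Overwrite the target table with the aggregated dwell times.
result.write
  .mode("overwrite")
  .jdbc("jdbc:mysql://localhost:3306/page_stats", "user_page_time", props)

The MySQL Connector/J jar must be on the classpath (e.g. via --jars) for the write to succeed.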