当前位置: 首页 >数据库 > 第六周-电子商务网站用户行为分析

第六周-电子商务网站用户行为分析

访问数据库

import os
import pandas as pd

# Switch the working directory to the data folder if needed.
# os.chdir("D:/chapter11/demo")

# Connection option 1: through SQLAlchemy.
# from sqlalchemy import create_engine
# engine = create_engine('mysql+pymysql://root:123@192.168.31.140:3306/test?charset=utf8')
# sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)

# Connection option 2: through pymysql directly.
import pymysql as pm

con = pm.connect(host='localhost', user='root', password='123456',
                 database='test', charset='utf8')
data = pd.read_sql('select * from all_gzdata', con=con)
con.close()  # release the connection

# Persist the raw table to CSV for later offline use.
data.to_csv("D:\\360MoveData\\Users\\86130\\Documents\\Tencent Files\\2268756693\\FileRecv\\all_gzdata.csv", index=False, encoding='utf-8')

网页类型设计

import pandas as pd
from sqlalchemy import create_engine

# Read the raw clickstream table in 10000-row chunks.
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# Count page-type ids chunk by chunk, then merge the partial counts
# (group by index and sum so identical ids from different chunks combine).
counts = [i['fullURLId'].value_counts() for i in sql]
counts = pd.concat(counts).groupby(level=0).sum()
counts = counts.reset_index()  # move the id index into a regular column
counts.columns = ['index', 'num']  # rename, mainly the count column (default 0)
# First three digits of the id are the broad category; raw string avoids
# the invalid-escape warning the original '(\d{3})' literal produced.
counts['type'] = counts['index'].str.extract(r'(\d{3})')
counts_ = counts[['type', 'num']].groupby('type').sum()  # merge by category
counts_.sort_values(by='num', ascending=False, inplace=True)  # descending
counts_['ratio'] = counts_.iloc[:, 0] / counts_.iloc[:, 0].sum()
print(counts_)
 num ratiotype  101411665  0.491570199201426  0.240523107182900  0.21840130118430  0.02200710217357  0.020726106 3957  0.004725103 1715  0.002048

知识类型内部统计
# Id 107001 is a single type, but it can be split into three page kinds:
# knowledge content page, knowledge list page and knowledge home page.
def count107(i):
    """Count the three knowledge-page kinds inside one data chunk.

    i: DataFrame chunk with 'fullURL' and 'fullURLId' columns.
    Returns a Series mapping kind label -> number of rows.
    """
    j = i[['fullURL']][i['fullURLId'].str.contains('107')].copy()  # 107-type URLs
    j['type'] = None  # add an empty classification column
    # .loc avoids the chained-assignment pitfall of the original
    # j['type'][mask] = ... form; later matches deliberately overwrite earlier ones.
    j.loc[j['fullURL'].str.contains('info/.+?/'), 'type'] = '知识首页'
    j.loc[j['fullURL'].str.contains('info/.+?/.+?'), 'type'] = '知识列表页'
    j.loc[j['fullURL'].str.contains(r'/\d+?_*\d+?\.html'), 'type'] = '知识内容页'
    # Original said "retu j['type'].value_counts()" — a syntax error; fixed.
    return j['type'].value_counts()

# NOTE: the sql chunk iterator is exhausted after one pass, so the table
# must be queried again before every new scan (!!!).
#engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
counts2 = [count107(i) for i in sql]  # per-chunk counts
counts2 = pd.concat(counts2).groupby(level=0).sum()  # merge partial counts
print(counts2)

# Share of each kind.
res107 = pd.DataFrame(counts2)
# res107.reset_index(inplace=True)
res107.index.name = '107类型'
res107.rename(columns={'type': 'num'}, inplace=True)
res107['比例'] = res107['num'] / res107['num'].sum()
res107.reset_index(inplace=True)
print(res107)
知识内容页164243知识列表页  9656知识首页9001Name: type, dtype: int64107类型 num比例0  知识内容页  164243  0.8979931  知识列表页9656  0.0527942知识首页9001  0.049213

统计带“?”的数据

def countquestion(i):
    """Return the fullURLId column of rows whose URL contains a '?'.

    (The original comment claimed this finds 107-type URLs — a copy-paste
    mistake; the filter is on the '?' character.)
    """
    j = i[['fullURLId']][i['fullURL'].str.contains(r'\?')].copy()
    # Original said "retu j" — a syntax error; fixed.
    return j

# The sql chunk iterator is exhausted after one pass; re-query the table.
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
counts3 = [countquestion(i)['fullURLId'].value_counts() for i in sql]
counts3 = pd.concat(counts3).groupby(level=0).sum()
print(counts3)

# Share of each type; percentages in [0, 100].
df1 = pd.DataFrame(counts3)
df1['perc'] = df1['fullURLId'] / df1['fullURLId'].sum() * 100
df1.sort_values(by='fullURLId', ascending=False, inplace=True)
print(df1.round(4))
1010034710200225107001346199900164718301001356Name: fullURLId, dtype: int64 fullURLId perc1999001  64718  98.8182301001 3560.5436107001 3460.5283101003  470.0718102002  250.0382

统计199类型中的具体类型占比

def page199(i):
    """Classify the '?'-bearing URLs of the 199 id group by page title.

    i: DataFrame chunk with 'fullURLId', 'fullURL' and 'pageTitle' columns.
    Returns the filtered rows with an added 'type' label column.
    """
    # .copy() added: the original mutated a slice of the chunk in place,
    # which triggers SettingWithCopyWarning and may silently fail.
    j = i[['fullURL', 'pageTitle']][(i['fullURLId'].str.contains('199')) &
                                    (i['fullURL'].str.contains(r'\?'))].copy()
    j['pageTitle'] = j['pageTitle'].fillna('空')
    j['type'] = '其他'  # default label
    # .loc replaces the chained j['type'][mask] = ... assignments;
    # later matches deliberately overwrite earlier ones.
    j.loc[j['pageTitle'].str.contains('法律快车-律师助手'), 'type'] = '法律快车-律师助手'
    j.loc[j['pageTitle'].str.contains('咨询发布成功'), 'type'] = '咨询发布成功'
    j.loc[j['pageTitle'].str.contains('免费发布法律咨询'), 'type'] = '免费发布法律咨询'
    j.loc[j['pageTitle'].str.contains('法律快搜'), 'type'] = '快搜'
    j.loc[j['pageTitle'].str.contains('法律快车法律经验'), 'type'] = '法律快车法律经验'
    j.loc[j['pageTitle'].str.contains('法律快车法律咨询'), 'type'] = '法律快车法律咨询'
    j.loc[(j['pageTitle'].str.contains('_法律快车')) |
          (j['pageTitle'].str.contains('-法律快车')), 'type'] = '法律快车'
    j.loc[j['pageTitle'].str.contains('空'), 'type'] = '空'
    # Original said "retu j" — a syntax error; fixed.
    return j

# The sql chunk iterator is exhausted after one pass; re-query the table.
#engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
# Alternative single-shot read:
#sql = pd.read_sql_query('select * from all_gzdata limit 10000', con=engine)
counts4 = [page199(i) for i in sql]  # per-chunk classification
counts4 = pd.concat(counts4)
d1 = counts4['type'].value_counts()
print(d1)
d2 = counts4[counts4['type'] == '其他']
print(d2)

# Share of each label; percentages in [0, 100].
df1_ = pd.DataFrame(d1)
df1_['perc'] = df1_['type'] / df1_['type'].sum() * 100
df1_.sort_values(by='type', ascending=False, inplace=True)
print(df1_)
法律快车-律师助手49894法律快车法律咨询  6421咨询发布成功5220快搜1943法律快车818其他 359法律快车法律经验59空4Name: type, dtype: int64fullURL  \2631  http://www.lawtime.cn/spelawyer/index.php?py=g...2632  http://www.lawtime.cn/spelawyer/index.php?py=g...1677  http://m.baidu.com/from=844b/bd_page_type=1/ss...4303  http://m.baidu.com/from=0/bd_page_type=1/ssid=...3673  http://www.lawtime.cn/lawyer/lll25879862593080...... ...4829  http://www.lawtime.cn/spelawyer/index.php?m=se...4837  http://www.lawtime.cn/spelawyer/index.php?m=se...4842  http://www.lawtime.cn/spelawyer/index.php?m=se...8302  http://www.lawtime.cn/spelawyer/index.php?m=se...5034  http://www.baidu.com/link?url=O7iBD2KmoJdkHWTZ...pageTitle type  2631个旧律师成功案例 - 法律快车提供个旧知名律师、优秀律师、专业律师的咨询和推荐其他  2632个旧律师成功案例 - 法律快车提供个旧知名律师、优秀律师、专业律师的咨询和推荐其他  1677  婚姻法论文 - 法律快车法律论文其他  4303什么是机动车?什么是非机动车? - 法律快车交通事故其他  3673  404错误提示页面 - 法律快车其他  ......  ...  4829  律师搜索,律师查找 - 法律快车提供全国知名律师、优秀律师、专业律师的咨询和推荐其他  4837  律师搜索,律师查找 - 法律快车提供全国知名律师、优秀律师、专业律师的咨询和推荐其他  4842  律师搜索,律师查找 - 法律快车提供全国知名律师、优秀律师、专业律师的咨询和推荐其他  8302  律师搜索,律师查找 - 法律快车提供全国知名律师、优秀律师、专业律师的咨询和推荐其他  5034 离婚协议书范本(2015年版) - 法律快车婚姻法其他  [359 rows x 3 columns]typeperc法律快车-律师助手  49894  77.094471法律快车法律咨询64219.921506咨询发布成功  52208.065762快搜  19433.002256法律快车 8181.263945其他3590.554714法律快车法律经验  590.091165空  40.006181

统计无目的浏览用户中各个类型占比

def xiaguang(i):
    """Return rows whose URL has no '.html' suffix (aimless browsing)."""
    j = i.loc[(i['fullURL'].str.contains(r'\.html')) == False,
              ['fullURL', 'fullURLId', 'pageTitle']]
    # Original said "retu j" — a syntax error; fixed.
    return j

# The sql chunk iterator is exhausted after one pass; re-query the table.
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
counts5 = [xiaguang(i) for i in sql]
counts5 = pd.concat(counts5)
xg1 = counts5['fullURLId'].value_counts()
print(xg1)

# Share of each id, then aggregated by the 3-digit broad category.
xg_ = pd.DataFrame(xg1)
xg_.reset_index(inplace=True)
xg_.columns = ['index', 'num']
xg_['perc'] = xg_['num'] / xg_['num'].sum() * 100
xg_.sort_values(by='num', ascending=False, inplace=True)
xg_['type'] = xg_['index'].str.extract(r'(\d{3})')  # first 3 digits = category id
xgs_ = xg_[['type', 'num']].groupby('type').sum()  # merge by category
xgs_.sort_values(by='num', ascending=False, inplace=True)  # descending
xgs_['percentage'] = xgs_['num'] / xgs_['num'].sum() * 100
print(xgs_.round(4))

统计用户浏览网页次数的情况

# Count clicks per user (realIP), then the distribution of click counts.
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# Per-chunk IP frequencies, merged by grouping on the index (the IP).
counts1 = [i['realIP'].value_counts() for i in sql]
counts1 = pd.concat(counts1).groupby(level=0).sum()
print(counts1)

counts1_ = pd.DataFrame(counts1)
# Normalize the column name so the groupby below works on every pandas
# version; the column holds each IP's total click count.
# NOTE(review): the original line here was garbled
# ("counts1_counts1['realIP'] = counts1.index.tolist()") and would raise
# a NameError; it is not needed — the click counts are already the column.
counts1_.columns = ['realIP']
counts1_[1] = 1  # helper column of ones
# Group by click count and sum the ones -> how many users had each count.
# Equivalent to counts1_['realIP'].value_counts().
hit_count = counts1_.groupby('realIP').sum()
hit_count.columns = ['用户数']
hit_count.index.name = '点击次数'

# Users with 1..7 clicks, plus a bucket for more than 7.
hit_count.sort_index(inplace=True)
hit_count_7 = hit_count.iloc[:7, :]
over7 = hit_count.iloc[7:, 0].sum()  # users who clicked more than 7 times
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
hit_count_7 = pd.concat([hit_count_7, pd.DataFrame([{'用户数': over7}])],
                        ignore_index=True)
hit_count_7.index = ['1', '2', '3', '4', '5', '6', '7', '7次以上']
hit_count_7['用户比例'] = hit_count_7['用户数'] / hit_count_7['用户数'].sum()
print(hit_count_7)
82033 295502 1103182111601021362061 ..4294809358242948111501429485215434294865422242949176901Name: realIP, Length: 230149, dtype: int64 用户数  用户比例1 132119  0.5740592  44175  0.1919413  17573  0.0763554  10156  0.04412855952  0.02586264132  0.01795472632  0.0114367次以上13410  0.058267

分析浏览一次的用户行为

# Initialize the database connection:
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=1024 * 5)

# Count clicks per IP, chunk by chunk.
result = [i['realIP'].value_counts() for i in sql]
click_count = pd.concat(result).groupby(level=0).sum()
click_count = click_count.reset_index()
click_count.columns = ['realIP', 'times']

# Keep only the IPs that clicked exactly once.
click_one_data = click_count[click_count['times'] == 1]

# `sql` is a generator and is exhausted after one pass, so query again.
sql = pd.read_sql('all_gzdata', engine, chunksize=1024 * 5)
data = [i[['fullURLId', 'fullURL', 'realIP']] for i in sql]
data = pd.concat(data)

# Left-join on realIP, with click_one_data as the base, so each one-click
# IP carries the page it visited and that page's type id.
merge_data = pd.merge(click_one_data, data, on='realIP', how='left')
# Persist for later runs once verified, then keep commented out:
# merge_data.to_sql('click_one_count', engine, if_exists='append')
print(merge_data)

# Top-4 page types among one-click visits, the rest bucketed as '其他'.
# NOTE(review): the original used URL_count without ever defining it and
# summed the tail of the unrelated hit_count frame; reconstructed here
# from merge_data, matching the printed result table.
URL_count = pd.DataFrame(merge_data['fullURLId'].value_counts())
URL_count.columns = ['count']  # normalize name across pandas versions
URL_count_4 = URL_count.iloc[:4, :]
rest = URL_count.iloc[4:, 0].sum()  # total of all remaining types
URLindex = URL_count_4.index.values
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
URL_count_4 = pd.concat([URL_count_4, pd.DataFrame([{'count': rest}])],
                        ignore_index=True)
URL_count_4.index = [URLindex[0], URLindex[1], URLindex[2], URLindex[3], '其他']
URL_count_4['比例'] = URL_count_4['count'] / URL_count_4['count'].sum()
print(URL_count_4)
realIP  times fullURLId  \095502  11010031103182  11010032136206  11010033140151  11070014155761  1101003............132114  4294737166  1101003132115  4294804343  1101003132116  4294807822  1101003132117  4294811150  1101003132118  4294917690  1101003 fullURL  0 http://www.lawtime.cn/ask/question_7882607.html  1 http://www.lawtime.cn/ask/question_7174864.html  2 http://www.lawtime.cn/ask/question_8246285.html  3http://www.lawtime.cn/info/gongsi/slbgfgs/2011...  4 http://www.lawtime.cn/ask/question_5951952.html  ......  132114http://www.lawtime.cn/ask/question_3947040.html  132115http://www.lawtime.cn/ask/question_2064846.html  132116http://www.lawtime.cn/ask/question_9981155.html  132117http://www.lawtime.cn/ask/question_4931163.html  132118http://www.lawtime.cn/ask/question_6910223.html  [132119 rows x 4 columns]  count比例101003102560  0.64901110700119443  0.12303719990019381  0.059364301001  515  0.003259其他26126  0.165328

统计单用户浏览次数为一次的网页

# NOTE(review): the original first built fullURL_count from `real_one`, a
# name defined nowhere in this script (NameError at runtime), and the result
# was immediately overwritten below anyway. Kept commented for reference:
# fullURL_count = pd.DataFrame(real_one.groupby("fullURL")["fullURL"].count())
# fullURL_count.columns = ["count"]
# fullURL_count["fullURL"] = fullURL_count.index.tolist()
# fullURL_count.sort_values(by='count', ascending=False, inplace=True)

# Page-type id distribution among one-click visits.
fullURLId_count = merge_data['fullURLId'].value_counts()
fullURLId_count = fullURLId_count.reset_index()
fullURLId_count.columns = ['fullURLId', 'count']
fullURLId_count['percent'] = fullURLId_count['count'] / fullURLId_count['count'].sum() * 100
print('*****' * 10)
print(fullURLId_count)

# Pages viewed by the users who clicked only once.
fullURL_count = merge_data['fullURL'].value_counts()
fullURL_count = fullURL_count.reset_index()
fullURL_count.columns = ['fullURL', 'count']
fullURL_count['percent'] = fullURL_count['count'] / fullURL_count['count'].sum() * 100
print('*****' * 10)
print(fullURL_count)
**************************************************fullURLIdcountpercent0 101003  102560  77.6269881 10700119443  14.7162792199900193817.1004173 301001 5150.3898004 102001  700.0529835 103003  450.0340606 101002  330.0249777 101001  280.0211938 102002  130.0098409 106001  130.0098401010100940.0030281110100430.0022711210100730.0022711310100820.0015141410200320.0015141510100510.0007571610200410.0007571710100610.0007571810200610.000757************************************************** fullURL  countpercent0  http://www.lawtime.cn/info/shuifa/slb/20121119...1013  0.7667331  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...501  0.3792042 http://www.lawtime.cn/ask/question_925675.html423  0.3201663  http://www.lawtime.cn/info/shuifa/slb/20121119...367  0.2777804http://www.lawtime.cn/ask/exp/13655.html301  0.227825...  .........88030http://www.lawtime.cn/ask/question_3357263.html  1  0.00075788031  http://www.lawtime.cn/info/laodong/laodonganli...  1  0.00075788032  http://www.lawtime.cn/info/lunwen/ipzhuzuo/201...  1  0.00075788033 http://www.lawtime.cn/ask/question_307554.html  1  0.00075788034http://www.lawtime.cn/ask/question_10467655.html  1  0.000757[88035 rows x 3 columns]

 

删除不符合规则的网页

import os
import re
import pandas as pd
import pymysql as pm
from random import sample

# Switch the working directory to the data folder.
os.chdir("D:\\360MoveData\\Users\\86130\\Documents\\Tencent Files\\2268756693\\FileRecv")

# Read the raw table.
con = pm.connect(host='localhost', user='root', password='123456',
                 database='test', charset='utf8')
data = pd.read_sql('select * from all_gzdata', con=con)
con.close()  # close the connection

# Rows whose page-type id contains '107'.
index107 = [re.search('107', str(i)) is not None for i in data.loc[:, 'fullURLId']]
data_107 = data.loc[index107, :]

# Within the 107 group, keep the marriage-law ('hunyin') pages.
index = [re.search('hunyin', str(i)) is not None for i in data_107.loc[:, 'fullURL']]
data_hunyin = data_107.loc[index, :]

# Keep only the fields we need (realIP, fullURL).
info = data_hunyin.loc[:, ['realIP', 'fullURL']]

# Strip '?' and everything after it from the URLs.
# Raw strings fix the invalid-escape warnings of the original '\?.*' literals.
da = [re.sub(r'\?.*', '', str(i)) for i in info.loc[:, 'fullURL']]
info.loc[:, 'fullURL'] = da  # replace the 'fullURL' column with the cleaned URLs

# Drop URLs that do not contain '.html'.
index = [re.search(r'\.html', str(i)) is not None for i in info.loc[:, 'fullURL']]
index.count(True)  # True counts as 1, False as 0
info1 = info.loc[index, :]
print("(学号 3110)去除无html网址如下:")
print(info1)
realIPfullURL02683657840  http://www.lawtime.cn/info/hunyin/hunyinfagui/...42683657840  http://www.lawtime.cn/info/hunyin/hunyinfagui/...91275347569  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...62  1531496412  http://www.lawtime.cn/info/hunyin/hunyinfagui/...86838215995  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20............837347  2320911216  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...837362  3458366734  http://www.lawtime.cn/info/hunyin/jhsy/daiyun/...837370  2526756791  http://www.lawtime.cn/info/hunyin/hynews/20101...837376  4267065457  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...837434  3271035001  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...[31199 rows x 2 columns]

还原翻页网址

# Split paginated from non-paginated URLs.
index = [re.search(r'/\d+_\d+\.html', i) is not None for i in info1.loc[:, 'fullURL']]
index1 = [i == False for i in index]
# .copy() added: the original assigned into slices of info1, which triggers
# SettingWithCopyWarning and may silently fail on modern pandas.
info1_1 = info1.loc[index, :].copy()  # paginated URLs
info1_2 = info1.loc[index1, :]        # non-paginated URLs

# Collapse every paginated URL back to its first page.
da = [re.sub(r'_\d+\.html', '.html', str(i)) for i in info1_1.loc[:, 'fullURL']]
info1_1.loc[:, 'fullURL'] = da

# Re-combine paginated and non-paginated rows.
frames = [info1_1, info1_2]
info2 = pd.concat(frames)
# equivalently: info2 = pd.concat([info1_1, info1_2], axis=0)  # 0 = row-wise (default)

# Drop duplicates (identical realIP and fullURL).
info3 = info2.drop_duplicates()

# Cast both columns to str.
info3 = info3.copy()  # own the data before assigning into it
info3.iloc[:, 0] = [str(v) for v in info3.iloc[:, 0]]
info3.iloc[:, 1] = [str(v) for v in info3.iloc[:, 1]]
# NOTE(review): '翻译' in the message below looks like a typo for '翻页'
# (pagination); the runtime string is kept unchanged.
print("(学号 3110)还原的翻译网址如下:")
print(info3)
len(info3)
realIPfullURL02683657840  http://www.lawtime.cn/info/hunyin/hunyinfagui/...86838215995  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...98  1531496412  http://www.lawtime.cn/info/hunyin/hunyinfagui/...130  923358328  http://www.lawtime.cn/info/hunyin/zhonghun/zho...140 1275347569  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20............837191  3897562894  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...837362  3458366734  http://www.lawtime.cn/info/hunyin/jhsy/daiyun/...837370  2526756791  http://www.lawtime.cn/info/hunyin/hynews/20101...837376  4267065457  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...837434  3271035001  http://www.lawtime.cn/info/hunyin/lhlawlhxy/20...[16570 rows x 2 columns]
16570

筛选浏览次数不满两次的用户

# Code 11-12: keep only the IPs that visited often enough.

# Visit frequency per IP.
ip_freq = info3['realIP'].value_counts()

# Unpack into parallel lists: the IPs and their visit counts.
IP = list(ip_freq.index)
count = list(ip_freq.values)

# Rebuild as a frame: first column the IP, second its visit count.
IP_count = pd.DataFrame({'IP': IP, 'count': count})
print("(学号 3110)")
print(IP_count)

# Select the IPs that browsed more than n pages.
n = 2
index = IP_count.loc[:, 'count'] > n
IP_index = IP_count.loc[index, 'IP']
print(IP_index)
  IP  count0  26091135278951  38124107441402225896631 593242673847 564  1190924814 48.........10524  3494221838  110525  1219597838  11052649885111  110527  2861434551  110528  2306969614  1[10529 rows x 2 columns]0  26091135271  3812410744222589663132426738474  1190924814  ...86536345009808661519157623867385163326586822133643378691938534819Name: IP, Length: 870, dtype: object

 

划分数据集

# Split the IP set into training and test sets (80/20).
index_tr = sample(range(0, len(IP_index)), int(len(IP_index) * 0.8))  # or np.random.sample
# Set membership is O(1); the original tested `i not in index_tr` (a list)
# per position — accidentally quadratic.
tr_positions = set(index_tr)
index_te = [i for i in range(0, len(IP_index)) if i not in tr_positions]
IP_tr = IP_index[index_tr]
IP_te = IP_index[index_te]

# Split the browsing records accordingly. The original rebuilt list(IP_tr)
# for every row; building each set once makes this linear instead.
tr_ips = set(IP_tr)
te_ips = set(IP_te)
index_tr = [ip in tr_ips for ip in info3.loc[:, 'realIP']]
index_te = [ip in te_ips for ip in info3.loc[:, 'realIP']]
data_tr = info3.loc[index_tr, :]
data_te = info3.loc[index_te, :]
print("(学号 3110)")
print(len(data_tr))

IP_tr = data_tr.iloc[:, 0]   # training-set IPs
url_tr = data_tr.iloc[:, 1]  # training-set URLs
IP_tr = list(set(IP_tr))     # deduplicate
url_tr = list(set(url_tr))   # deduplicate
len(url_tr)
4542

2448

作者:龙鳞20
来源链接:https://www.cnblogs.com/zhilin00/p/17282493.html

版权声明:
1、JavaClub(https://www.javaclub.cn)以学习交流为目的,由作者投稿、网友推荐和小编整理收藏优秀的IT技术及相关内容,包括但不限于文字、图片、音频、视频、软件、程序等,其均来自互联网,本站不享有版权,版权归原作者所有。

2、本站提供的内容仅用于个人学习、研究或欣赏,以及其他非商业性或非盈利性用途,但同时应遵守著作权法及其他相关法律的规定,不得侵犯相关权利人及本网站的合法权利。
3、本网站内容原作者如不愿意在本网站刊登内容,请及时通知本站(javaclubcn@163.com),我们将第一时间核实后及时予以删除。





本文链接:https://www.javaclub.cn/database/117549.html

标签:group by
分享给朋友:

“第六周-电子商务网站用户行为分析” 的相关文章

MySQL表的增删改查(进阶) 2022年05月16日 21:54:11
mysql 查询表 所有字段 2022年06月09日 20:38:57
mysql 查询数据结果计算百分比 2022年06月11日 10:01:40
mysql查询数据库表的数据行数 2022年06月12日 12:49:23
MYSQL查询返回JSON格式的字符串 2022年06月12日 23:10:27
MYSQL优化建议 2022年06月18日 08:23:02
mysql 查询大于多少天的数据 2022年06月22日 12:28:57