首页 👨‍💻计算机,🐍Python

乐高淘宝数据可视化

之前学习前端的时候了解过 ECharts,它做出来的可视化效果很好看。今天正好有这个机会,试着做一做。

[btnblue href="https://lavines.gitee.io/show" target="blank"]效果链接[/btnblue]

image-20201010205714516

1,导入模块

import pandas as pd
import numpy as np
import jieba 
import time
import stylecloud
from IPython.display import Image
from pyecharts.charts import Bar,Line,Map,Page,Pie
from pyecharts import options as opts
from pyecharts.globals import SymbolType

2,获取数据

# Load the LEGO Taobao CSV. The raw-string prefix keeps the Windows backslashes
# literal instead of relying on unrecognised escape sequences such as "\l".
df_tb = pd.read_csv(r"F:\桌面\数据分析项目练习\legao3225\乐高淘宝数据.csv")
df_tb.head()

image-20201011134846590

# Print a summary of the frame (columns, dtypes, non-null counts).
df_tb.info()

image-20201011134906574

3,数据处理

  • 去重复值
  • goods_name:暂不处理
  • shop_name:暂不处理
  • price:暂不处理
  • purchase_num:暂不处理
  • 计算销售额= price*purchase_num
  • location:提取省份
# Drop exact duplicate rows.
df_tb.drop_duplicates(inplace=True)
# Keep only rows whose purchase_num text contains '人付款'. na=False makes
# missing values count as "no match", so the boolean mask never contains NaN
# (a NaN mask would make the row selection raise).
df_tb = df_tb[df_tb['purchase_num'].str.contains('人付款', na=False)]

# Rebuild a contiguous index after the rows were filtered out.
df_tb = df_tb.reset_index(drop=True)
df_tb.info()

image-20201011134927606

# purchase_num: take the first run of digits from the text and store it as int.
# The pattern is a raw string so "\d" reaches the regex engine unchanged, and
# expand=False makes str.extract return a Series rather than a one-column frame.
# NOTE(review): a value such as '1.5万+人付款' would be read as 1 - confirm the
# data contains no abbreviated counts.
df_tb['purchase_num'] = (
    df_tb['purchase_num'].str.extract(r'(\d+)', expand=False).astype('int')
)
# Sales amount = unit price * number of buyers.
df_tb['sales_volume'] = df_tb['price'] * df_tb['purchase_num']
# province: the first space-separated token of location.
df_tb['province'] = df_tb['location'].str.split(' ').str[0]
df_tb.head()

image-20201011134948862

4,数据可视化

1.乐高销量排名top10店铺-条形图

   2.乐高产地数量排名top10–条形图
   3.乐高产地国内销量分布–地图
   4.价格分布–条形图
   5.不同价格区间的销量表现–饼图
   6.商品标题词云图-词云图
# Sum purchase_num per shop, order from largest to smallest, keep the first ten.
shop_top10 = (
    df_tb.groupby('shop_name')['purchase_num']
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)
shop_top10

image-20201011135853052

1.乐高销量排名top10店铺-条形图
# Bar chart: total purchase_num of the ten best-selling shops.
# For a larger canvas, construct the chart as
# Bar(init_opts=opts.InitOpts(width='1350px', height='750px')).
bar1 = Bar()
bar1.add_xaxis(shop_top10.index.tolist())
bar1.add_yaxis('', shop_top10.values.tolist())
bar1.set_global_opts(
    title_opts=opts.TitleOpts(title='乐高销量排名Top10淘宝店铺'),
    # Rotate the x-axis labels by -15 degrees.
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
    # Take the visual-map upper bound from the data rather than from a number
    # copied out of one particular run.
    visualmap_opts=opts.VisualMapOpts(max_=int(shop_top10.max())),
)
bar1.render_notebook()

image-20201011140035274

2.乐高产地数量排名top10–条形图

# Count listings per province (most frequent first) and keep the first ten.
province_top10 = df_tb['province'].value_counts().head(10)
province_top10

image-20201011140350238

2.乐高产地数量排名top10–条形图
# Bar chart: number of listings for the ten most common provinces.
bar2 = Bar()
bar2.add_xaxis(province_top10.index.tolist())
bar2.add_yaxis('', province_top10.values.tolist())
bar2.set_global_opts(
    # The title previously repeated the price-band chart's title; it now
    # describes what this chart actually plots.
    title_opts=opts.TitleOpts(title="乐高产地数量排名Top10"),
    visualmap_opts=opts.VisualMapOpts(max_=1000),
)
bar2.render_notebook()

image-20201011140421144

# Sum purchase_num per province and order from largest to smallest.
province_num = (
    df_tb.groupby('province')['purchase_num']
    .sum()
    .sort_values(ascending=False)
)

# Preview the ten largest totals.
province_num.head(10)

image-20201011140703308

3.乐高产地国内销量分布–地图
# China map: total purchase_num per province.
map1 = Map()
map1.add(
    "",
    # One [province, total] pair per row; tolist() yields plain Python values.
    [list(z) for z in zip(province_num.index.tolist(), province_num.values.tolist())],
    maptype='china',
)
map1.set_global_opts(
    title_opts=opts.TitleOpts(title='国内各产地乐高销量分布图'),
    # Take the visual-map upper bound from the data rather than from a number
    # copied out of one particular run.
    visualmap_opts=opts.VisualMapOpts(max_=int(province_num.max())),
)
map1.render_notebook()

image-20201011140746136

# Price bands (pd.cut intervals are right-closed by default: (0, 50], (50, 100], ...).
# The last edge is infinity so that every price above 1000 lands in
# '1000元以上' instead of becoming NaN once it exceeds a fixed cap.
cut_bins = [0, 50, 100, 200, 300, 500, 1000, np.inf]
# Every label carries the '元' suffix ('50~100' was missing it).
cut_labels = ['0~50元', '50~100元', '100~200元', '200~300元', '300~500元', '500~1000元', '1000元以上']

price_cut = pd.cut(df_tb['price'], bins=cut_bins, labels=cut_labels)
# Number of listings in each band.
price_num = price_cut.value_counts()
price_num

image-20201011140928205

4.淘宝乐高价格分布–条形图
# Bar chart: number of listings per price band, in ascending price order.
# Labels and counts are both read from price_num (re-sorted by band) instead
# of being typed in by hand, so they cannot drift apart or pick up typos.
price_dist = price_num.sort_index()
bar3 = Bar()
bar3.add_xaxis(price_dist.index.tolist())
bar3.add_yaxis('', price_dist.values.tolist())
bar3.set_global_opts(
    title_opts=opts.TitleOpts(title="不同价格区间的商品数量"),
    visualmap_opts=opts.VisualMapOpts(max_=900),
)
bar3.render_notebook()

image-20201011141243791

5.不同价格区间的销量表现–饼图
# Attach the price band to every row, then total sales_volume per band.
df_tb['price_cut'] = price_cut
# observed=False is spelled out so empty bands stay in the result and pandas
# does not warn about the changing default for categorical groupers.
cut_purchase = df_tb.groupby('price_cut', observed=False)['sales_volume'].sum()
cut_purchase

image-20201011141347883

# One [band label, sales total] pair per price band.
data_pair = [
    [label, total]
    for label, total in zip(cut_purchase.index.tolist(), cut_purchase.values.tolist())
]

# Ring-shaped pie chart of sales_volume per price band.
piel = Pie()
piel.add("", data_pair, radius=['35%', '60%'])
piel.set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
piel.set_global_opts(
    # Descriptive title in place of the leftover placeholder "Pie-设置颜色".
    title_opts=opts.TitleOpts(title="不同价格区间的销售额占比"),
    legend_opts=opts.LegendOpts(orient='vertical', pos_top='15%', pos_left='2%'),
)

# Label every slice as "<band>: <value>".
piel.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
piel.render_notebook()

image-20201011141015230

6.商品标题词云图-词云图

词云图参数设置,根据自己喜好进行更改

  • text:输入文本。
  • file_path:输入文本/CSV 的文件路径。
  • gradient:梯度方向(其默认值是 None,如果它的值不是 None,则 stylecloud 使用了方向性梯度)[default: None]
  • size:stylecloud 的大小(长度和宽度)[default: 512]
  • icon_name:stylecloud 形状的图标名称(如 fas fa-grin)[default: fas fa-flag]
  • palette:调色板(通过 palettable 实现)[default: cartocolors.qualitative.Bold_6]
  • background_color:背景颜色。[default: white]
  • max_font_size:stylecloud 中的最大字号。[default: 200]
  • max_words:stylecloud 可包含的最大单词数。[default: 2000]
  • stopwords:布尔值,用于筛除常见停用词。[default: True]
  • output_name:stylecloud 的输出文件名。[default: stylecloud.png]
  • font_path:stylecloud 所用字体 .ttf 文件的路径。[default: uses included Staatliches font]
  • random_state:控制单词和颜色的随机状态。
def get_cut_words(content_series,
                  stopwords_path=r"F:\桌面\数据分析项目练习\legao3225\cn_stopwords.txt"):
    """Segment a Series of text with jieba and filter the resulting tokens.

    Args:
        content_series: pandas Series of strings to segment.
        stopwords_path: path to a UTF-8 text file holding one stop word per
            line. Defaults to the location used in this write-up.

    Returns:
        A list of tokens, in segmentation order, that are at least two
        characters long and do not appear in the stop-word file.
    """
    # Read the stop words into a set so each membership test below is O(1).
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}

    # Add domain terms to jieba's dictionary before segmenting.
    for word in ('乐高', '悟空小侠', '大颗粒', '小颗粒'):
        jieba.add_word(word)

    # Extra project-specific stop words can be added here with
    # stop_words.update([...]).

    # Concatenate all rows with '。' between them and segment in one pass
    # (cut_all=False, as in the original call).
    words = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)

    # Drop stop words and single-character tokens.
    return [w for w in words if len(w) >= 2 and w not in stop_words]
# Tokenise the goods_name column and preview the first ten tokens.
text=get_cut_words(content_series=df_tb['goods_name'])
text[:10]

image-20201011141500891

# Draw the word cloud from the space-joined tokens and save it as a PNG.
# NOTE(review): font_path points at a local simhei.ttf, presumably so the
# Chinese words render - adjust it to a font available on your machine.
stylecloud.gen_stylecloud(
    text=' '.join(text),
    collocations=False,
    font_path=r'G:\新的站点\数据分析可视化\simhei.ttf',
    icon_name='fas fa-plane',
    size=768,
    background_color="#040f3c",
    output_name='淘宝乐高标题词云图.png'
)
# Display the generated image inline.
Image(filename='淘宝乐高标题词云图.png')

image-20201011141607974

5,完整代码

# 导入模块
import time

import jieba
import numpy as np
import pandas as pd
import stylecloud
from IPython.display import Image
from pyecharts import options as opts
from pyecharts.charts import Bar,Line,Map,Page,Pie
from pyecharts.globals import SymbolType
# Load the LEGO Taobao CSV. The raw-string prefix keeps the Windows backslashes
# literal instead of relying on unrecognised escape sequences such as "\l".
df_tb = pd.read_csv(r"F:\桌面\数据分析项目练习\legao3225\乐高淘宝数据.csv")
df_tb.head()
df_tb.info()
# Drop exact duplicate rows.
df_tb.drop_duplicates(inplace=True)
# Keep only rows whose purchase_num text contains '人付款'. na=False makes
# missing values count as "no match", so the boolean mask never contains NaN.
df_tb = df_tb[df_tb['purchase_num'].str.contains('人付款', na=False)]

# Rebuild a contiguous index after the rows were filtered out.
df_tb = df_tb.reset_index(drop=True)
df_tb.info()
# purchase_num: take the first run of digits from the text and store it as int.
# Raw-string pattern; expand=False makes str.extract return a Series.
# NOTE(review): a value such as '1.5万+人付款' would be read as 1 - confirm the
# data contains no abbreviated counts.
df_tb['purchase_num'] = (
    df_tb['purchase_num'].str.extract(r'(\d+)', expand=False).astype('int')
)
# Sales amount = unit price * number of buyers.
df_tb['sales_volume'] = df_tb['price'] * df_tb['purchase_num']

# province: the first space-separated token of location.
df_tb['province'] = df_tb['location'].str.split(' ').str[0]
df_tb.head()
# Sum purchase_num per shop, order from largest to smallest, keep the first ten.
shop_top10 = (
    df_tb.groupby('shop_name')['purchase_num']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
shop_top10
# 1. Top-10 shops by LEGO sales - bar chart
# (The section heading is now a comment; as a bare line it was a syntax error.)
# For a larger canvas, construct the chart as
# Bar(init_opts=opts.InitOpts(width='1350px', height='750px')).
bar1 = Bar()
bar1.add_xaxis(shop_top10.index.tolist())
bar1.add_yaxis('', shop_top10.values.tolist())
bar1.set_global_opts(
    title_opts=opts.TitleOpts(title='乐高销量排名Top10淘宝店铺'),
    # Rotate the x-axis labels by -15 degrees.
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
    # Take the visual-map upper bound from the data rather than from a number
    # copied out of one particular run.
    visualmap_opts=opts.VisualMapOpts(max_=int(shop_top10.max())),
)
bar1.render("乐高销量排名Top10淘宝店铺.html")

# Count listings per province (most frequent first) and keep the first ten.
province_top10 = df_tb['province'].value_counts().head(10)
province_top10
# 2. Top-10 provinces by number of listings - bar chart
bar2 = Bar()
bar2.add_xaxis(province_top10.index.tolist())
bar2.add_yaxis('', province_top10.values.tolist())
bar2.set_global_opts(
    # VisualMapOpts was previously passed as title_opts, so the chart had no
    # title and no visual map. Each option now goes to its own keyword.
    title_opts=opts.TitleOpts(title="乐高产地数量排名Top10"),
    visualmap_opts=opts.VisualMapOpts(max_=1000),
)
bar2.render("乐高产地数量排名top10.html")
# Sum purchase_num per province and order from largest to smallest.
province_num = (
    df_tb.groupby('province')['purchase_num']
    .sum()
    .sort_values(ascending=False)
)
province_num[:10]
# 3. Domestic distribution of LEGO sales by province - map
map1 = Map()
map1.add(
    "",
    # One [province, total] pair per row; tolist() yields plain Python values.
    [list(z) for z in zip(province_num.index.tolist(), province_num.values.tolist())],
    maptype='china',
)
map1.set_global_opts(
    title_opts=opts.TitleOpts(title='国内各产地乐高销量分布图'),
    # Take the visual-map upper bound from the data rather than from a number
    # copied out of one particular run.
    visualmap_opts=opts.VisualMapOpts(max_=int(province_num.max())),
)
map1.render("国内各产地乐高销量分布图.html")
# Price bands (pd.cut intervals are right-closed by default: (0, 50], (50, 100], ...).
# The last edge is infinity so that every price above 1000 lands in
# '1000元以上' instead of becoming NaN once it exceeds a fixed cap.
cut_bins = [0, 50, 100, 200, 300, 500, 1000, np.inf]
# Every label carries the '元' suffix ('50~100' was missing it).
cut_labels = ['0~50元', '50~100元', '100~200元', '200~300元', '300~500元', '500~1000元', '1000元以上']

price_cut = pd.cut(df_tb['price'], bins=cut_bins, labels=cut_labels)
# Number of listings in each band.
price_num = price_cut.value_counts()
price_num
# 4. LEGO price distribution - bar chart
# The x-axis labels used to be typed in by hand in a shuffled order that no
# longer matched the counts. Labels and counts are now both read from
# price_num, re-sorted into ascending band order.
price_dist = price_num.sort_index()
bar3 = Bar()
bar3.add_xaxis(price_dist.index.tolist())
bar3.add_yaxis('', price_dist.values.tolist())
bar3.set_global_opts(
    title_opts=opts.TitleOpts(title="不同价格区间的商品数量"),
    visualmap_opts=opts.VisualMapOpts(max_=900),
)
bar3.render("不同价格区间的商品数量.html")
# 5. Sales per price band - pie chart
# Attach the price band to every row, then total sales_volume per band.
df_tb['price_cut'] = price_cut
# observed=False is spelled out so empty bands stay in the result and pandas
# does not warn about the changing default for categorical groupers.
cut_purchase = df_tb.groupby('price_cut', observed=False)['sales_volume'].sum()
cut_purchase
# One [band label, sales total] pair per price band.
data_pair = [
    [label, total]
    for label, total in zip(cut_purchase.index.tolist(), cut_purchase.values.tolist())
]
# Ring-shaped pie chart of sales_volume per price band.
piel = Pie()
piel.add("", data_pair, radius=['35%', '60%'])
piel.set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
piel.set_global_opts(
    # Descriptive title in place of the leftover placeholder "Pie-设置颜色".
    title_opts=opts.TitleOpts(title="不同价格区间的销售额占比"),
    legend_opts=opts.LegendOpts(orient='vertical', pos_top='15%', pos_left='2%'),
)
# Label every slice as "<band>: <value>".
piel.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
# Write an HTML file like every other chart in this listing.
piel.render("不同价格区间的销售额占比.html")
# 6. Word cloud of the goods titles
def get_cut_words(content_series,
                  stopwords_path=r"F:\桌面\数据分析项目练习\legao3225\cn_stopwords.txt"):
    """Segment a Series of text with jieba and filter the resulting tokens.

    Args:
        content_series: pandas Series of strings to segment.
        stopwords_path: path to a UTF-8 text file holding one stop word per
            line. Defaults to the location used in this write-up.

    Returns:
        A list of tokens, in segmentation order, that are at least two
        characters long and do not appear in the stop-word file.
    """
    # Read the stop words into a set so each membership test below is O(1).
    # (The collection was previously created as `shop_words` but used as
    # `stop_words`.)
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}

    # Add domain terms to jieba's dictionary before segmenting.
    for word in ('乐高', '悟空小侠', '大颗粒', '小颗粒'):
        jieba.add_word(word)

    # Segment once, after the loop above. The tokenising used to sit inside
    # the add_word loop.
    words = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)

    # Drop stop words and single-character tokens.
    return [w for w in words if len(w) >= 2 and w not in stop_words]


# Tokenise the goods_name column (the column name was misspelled 'goobs_name')
# and preview the first ten tokens.
text = get_cut_words(content_series=df_tb['goods_name'])
text[:10]
# Draw the word cloud from the space-joined tokens and save it as a PNG.
# NOTE(review): font_path points at a local simhei.ttf, presumably so the
# Chinese words render - adjust it to a font available on your machine.
stylecloud.gen_stylecloud(
    text=' '.join(text),
    collocations=False,
    font_path=r'G:\新的站点\数据分析可视化\simhei.ttf',
    icon_name='fas fa-plane',
    size=768,
    background_color="#040f3c",
    output_name='淘宝乐高标题词云图.png'
)
# Display the generated image inline.
Image(filename='淘宝乐高标题词云图.png')



文章评论

目录