你的科技基该换了——用统计方法对基金做相关性分析

基金的数据分析

　　你是否搞不清基金的种类，是大盘蓝筹型还是小盘科技型？你是否被基金的名字迷惑，以为华安媒体互联网混合就真的是投资互联网的？你是否洋洋洒洒地买了七八个类似的科技基金，还美其名曰分散投资？如果你搞不清楚基金之间的关系，那么你找对人了，你需要做一些技术分析。
　　如何判断两个基金的走势是否一致呢？笔者帮大家想了一种方法：皮尔逊相关系数分析（具体的计算公式在这里就不放了，想了解的可以自己去查阅相关资料）。简单来说，就是选取不同基金在相同时间段内的净值走势，然后计算它们两两之间的相关系数，组成相关系数矩阵，画出来大概是这样一个效果：

　　上图中，相关性越大(越接近1)的基金表示涨跌越一致(如天弘创业板ETF联接C和华宝科技ETF联接C)，相关性越小的基金表示越无关。这时你就会惊讶地发现，华安媒体互联网混合与计算机指数的相关性相对较低(0.76)，却与新能源的相关性极高(0.96)！所以为了起到真正分散风险的作用，还是推荐大家配置一些毫！无！关！系！的基金。
　　当然了，也不是说相关性小的都要配置一些，还是要有所选择。比如最近平均亏损达到50%以上的原油，那还是不碰为妙吧。

如何计算自己手中基金的相关性

　　下面的这部分就是给程序员朋友看的了。先把自己手中的基金代码都记录下来，然后到天天基金网上去爬历史净值数据，之后用numpy计算它们之间的相关系数矩阵，最后用seaborn画热力图就行啦。
　　下面是一个完整的脚本。

# -*- coding: utf-8 -*-
import json
import re

import requests
from matplotlib import pyplot as plt
import time
import seaborn as sns
import numpy as np

# Matplotlib defaults for this script: the font family is chosen so that
# Chinese labels display correctly, and the ASCII hyphen is used for the
# minus sign so that negative numbers display correctly as well.
plt.rcParams.update({
    'font.family': ['Hei'],
    'axes.unicode_minus': False,
})


def get_local_time(timestamp):
    """Format a Unix timestamp (seconds) as a ``YYYY/MM/DD`` date string.

    The timestamp is interpreted in the local time zone of the machine
    running the script.
    """
    return time.strftime("%Y/%m/%d", time.localtime(timestamp))


def get_name(code, timeout=10):
    """Fetch the display name of a fund from Eastmoney.

    Args:
        code: Fund code as a string, e.g. ``'001156'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The fund name as a string, or ``''`` when the request fails, the
        server does not answer with HTTP 200, or no name is found in the
        response body.
    """
    url = 'http://fund.eastmoney.com/pingzhongdata/%s.js' % code

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Always hand back a string: callers slice and concatenate the
        # result, which breaks on any other type.
        return ''

    if response.status_code != 200:
        return ''

    # The name is taken from the first `fS_name = "..."` assignment in the
    # returned JavaScript.
    match = re.search(r'fS_name = "(.*?)"', response.text)
    return match.group(1) if match else ''


def get_k_line(code, timeout=10):
    """Download a fund's net-worth history from Eastmoney.

    Args:
        code: Fund code as a string, e.g. ``'001156'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A tuple ``(name, worth_values, rates)``:

        * ``name`` - the fund name found in the response.
        * ``worth_values`` - the ``y`` value of every entry of
          ``Data_netWorthTrend``, in the order the server lists them.
        * ``rates`` - percentage change of each value relative to the
          previous one, rounded to two decimals; the first entry is 0.

        ``([], [], [])`` is returned when the request fails, the status is
        not 200, or the expected data cannot be found or parsed.
    """
    url = 'http://fund.eastmoney.com/pingzhongdata/%s.js' % code

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return [], [], []

    if response.status_code != 200:
        return [], [], []

    text = response.text
    name_match = re.search(r'fS_name = "(.*?)"', text)
    # The non-greedy match stops at the first ']'.
    # NOTE(review): assumes the array contains no nested brackets - confirm
    # against a live response.
    trend_match = re.search(r'var Data_netWorthTrend = \[(.*?)\]', text)
    if name_match is None or trend_match is None:
        return [], [], []

    try:
        worth_list = json.loads('[%s]' % trend_match.group(1))
    except ValueError:
        return [], [], []

    name = name_match.group(1)
    worth_values = [point['y'] for point in worth_list]

    rates = []
    for i, value in enumerate(worth_values):
        # The first point has no predecessor, so it is compared with itself.
        start = worth_values[i - 1] if i else value
        # A zero base has no defined percentage change; report 0 rather
        # than raising ZeroDivisionError.
        rate = round((value - start) / start * 100, 2) if start else 0.0
        rates.append(rate)

    return name, worth_values, rates


ids = ['001156', '320007', '001071', '005918', '001630', '001593', '001618', '000834', '006476', '161725', '007874']
window = 120  # number of most recent net-worth points compared per fund

# get_k_line() returns the fund name together with the net-worth series, so
# a single download per fund is enough.
names = []
value = []
for idx in ids:
    name, worth_values, _ = get_k_line(idx)
    if len(worth_values) < window:
        # np.corrcoef needs rows of equal length; a failed download or a
        # fund with a short history would make the input ragged.
        print('skip %s: only %d net-worth points' % (idx, len(worth_values)))
        continue
    names.append(name[:6])
    value.append(worth_values[-window:])

if len(value) < 2:
    raise SystemExit('need at least two funds with %d net-worth points' % window)

# NOTE(review): rows are aligned by position (last `window` points), not by
# date - confirm that all funds share the same trading calendar.
corrcoef = np.corrcoef(value)

fig, ax = plt.subplots(figsize=(9, 9))

# Draw on the axes created above instead of relying on the implicit
# "current axes".
sns.heatmap(np.round(corrcoef, 2), annot=True, vmax=1, vmin=0, xticklabels=True, yticklabels=True,
            square=True, ax=ax)

ax.set_yticklabels(names, fontsize=12, rotation=360, horizontalalignment='right')
# The x labels are split onto two lines after the third character.
x_labels = [name[:3] + '\n' + name[3:] for name in names]
ax.set_xticklabels(x_labels, fontsize=12)

plt.show()

100

101

102

# -*- coding: utf-8 -*-

import json

import re

import requests

from matplotlib import pyplot as plt

import time

import seaborn as sns

import numpy as np

# Matplotlib defaults for this script: the font family is chosen so that
# Chinese labels display correctly, and the ASCII hyphen is used for the
# minus sign so that negative numbers display correctly as well.
plt.rcParams.update({
    'font.family': ['Hei'],
    'axes.unicode_minus': False,
})

def get_local_time(timestamp):
    """Format a Unix timestamp (seconds) as a ``YYYY/MM/DD`` date string.

    The timestamp is interpreted in the local time zone of the machine
    running the script.
    """
    return time.strftime("%Y/%m/%d", time.localtime(timestamp))

def get_name(code, timeout=10):
    """Fetch the display name of a fund from Eastmoney.

    Args:
        code: Fund code as a string, e.g. ``'001156'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The fund name as a string, or ``''`` when the request fails, the
        server does not answer with HTTP 200, or no name is found in the
        response body.
    """
    url = 'http://fund.eastmoney.com/pingzhongdata/%s.js' % code

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Always hand back a string: callers slice and concatenate the
        # result, which breaks on any other type.
        return ''

    if response.status_code != 200:
        return ''

    # The name is taken from the first `fS_name = "..."` assignment in the
    # returned JavaScript.
    match = re.search(r'fS_name = "(.*?)"', response.text)
    return match.group(1) if match else ''

def get_k_line(code, timeout=10):
    """Download a fund's net-worth history from Eastmoney.

    Args:
        code: Fund code as a string, e.g. ``'001156'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A tuple ``(name, worth_values, rates)``:

        * ``name`` - the fund name found in the response.
        * ``worth_values`` - the ``y`` value of every entry of
          ``Data_netWorthTrend``, in the order the server lists them.
        * ``rates`` - percentage change of each value relative to the
          previous one, rounded to two decimals; the first entry is 0.

        ``([], [], [])`` is returned when the request fails, the status is
        not 200, or the expected data cannot be found or parsed.
    """
    url = 'http://fund.eastmoney.com/pingzhongdata/%s.js' % code

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return [], [], []

    if response.status_code != 200:
        return [], [], []

    text = response.text
    name_match = re.search(r'fS_name = "(.*?)"', text)
    # The non-greedy match stops at the first ']'.
    # NOTE(review): assumes the array contains no nested brackets - confirm
    # against a live response.
    trend_match = re.search(r'var Data_netWorthTrend = \[(.*?)\]', text)
    if name_match is None or trend_match is None:
        return [], [], []

    try:
        worth_list = json.loads('[%s]' % trend_match.group(1))
    except ValueError:
        return [], [], []

    name = name_match.group(1)
    worth_values = [point['y'] for point in worth_list]

    rates = []
    for i, value in enumerate(worth_values):
        # The first point has no predecessor, so it is compared with itself.
        start = worth_values[i - 1] if i else value
        # A zero base has no defined percentage change; report 0 rather
        # than raising ZeroDivisionError.
        rate = round((value - start) / start * 100, 2) if start else 0.0
        rates.append(rate)

    return name, worth_values, rates

ids = ['001156', '320007', '001071', '005918', '001630', '001593', '001618', '000834', '006476', '161725', '007874']
window = 120  # number of most recent net-worth points compared per fund

# get_k_line() returns the fund name together with the net-worth series, so
# a single download per fund is enough.
names = []
value = []
for idx in ids:
    name, worth_values, _ = get_k_line(idx)
    if len(worth_values) < window:
        # np.corrcoef needs rows of equal length; a failed download or a
        # fund with a short history would make the input ragged.
        print('skip %s: only %d net-worth points' % (idx, len(worth_values)))
        continue
    names.append(name[:6])
    value.append(worth_values[-window:])

if len(value) < 2:
    raise SystemExit('need at least two funds with %d net-worth points' % window)

# NOTE(review): rows are aligned by position (last `window` points), not by
# date - confirm that all funds share the same trading calendar.
corrcoef = np.corrcoef(value)

fig, ax = plt.subplots(figsize=(9, 9))

# Draw on the axes created above instead of relying on the implicit
# "current axes".
sns.heatmap(np.round(corrcoef, 2), annot=True, vmax=1, vmin=0, xticklabels=True, yticklabels=True,
            square=True, ax=ax)

ax.set_yticklabels(names, fontsize=12, rotation=360, horizontalalignment='right')
# The x labels are split onto two lines after the third character.
x_labels = [name[:3] + '\n' + name[3:] for name in names]
ax.set_xticklabels(x_labels, fontsize=12)

plt.show()