"Combination of Rankings" - The Stationary Distribution
Our "combination of rankings" method is essentially the construction of a Markov chain; the random walk on the graph that we did is a naive way to estimate its stationary distribution.
I will detail the math of the stationary distribution, and the hypotheses the Markov chain must satisfy for this stationary distribution to exist and be unique, in a follow-up post. For now, let's just assume that it exists. If so, it must satisfy x = xP, where x is the stationary distribution and P the transition matrix of the (homogeneous) Markov chain. Therefore, we only need to solve this equation to obtain the stationary distribution (and the final combined ranking): no need for the random walk simulation!
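As a quick sanity check (a toy two-state chain with numbers of my own, not part of the method), x = xP says the stationary distribution is a left eigenvector of P for eigenvalue 1:

import numpy as np

# Toy two-state transition matrix (arbitrary numbers)
P = np.array([[0.9, 0.1],
              [0.5, 0.5]])
# Left eigenvectors of P are eigenvectors of P.T
eigvals, eigvecs = np.linalg.eig(P.T)
x = np.real(eigvecs[:, np.argmax(np.isclose(eigvals, 1))])
x /= x.sum()  # normalize so that sum(x) = 1
print(x)                      # [0.83333333 0.16666667]
print(np.allclose(x, x @ P))  # True: x = xP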
In this blog post, we empirically show that:

- the naive random walk and solving the linear equation give very close solutions (correlation ≈ 0.99+); since we use a fixed number of steps for the random walk, it should approximate less well on bigger graphs (more rankings to combine), and we indeed notice that the correlation between the two methods decreases slightly;
- solving the equation yields better results (with respect to the true ranking) than random-walking the graph;
- solving the equation is faster.
In the next blog post, we will show that with some more math we can potentially do even better.
import pandas as pd
import numpy as np
import networkx as nx
from scipy.linalg import block_diag
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(seed=42)
def compute_stationary_distribution(P):
    """Return the stationary distribution of the Markov chain.

    Solves x = xP, where x is the stationary distribution:
    x - xP = 0 <=> x(I - P) = 0, subject to sum(x) = 1.

    :param P: The transition matrix of the Markov chain.
    """
    n = P.shape[0]
    # formulate as a x = b, appending the normalization constraint
    # sum(x) = 1 as an extra row, and solve by least squares
    a = np.eye(n) - P
    a = np.vstack((a.T, np.ones(n)))
    b = np.array([0] * n + [1])
    # rcond=None uses the future default and silences the FutureWarning
    return np.linalg.lstsq(a, b, rcond=None)[0]
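A minimal check of this function, reusing the toy two-state chain from above: the least-squares solution should match the eigenvector computation.

P = np.array([[0.9, 0.1],
              [0.5, 0.5]])
print(compute_stationary_distribution(P))  # ~[0.83333333 0.16666667]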
def build_graph(rankings):
    """Build a directed graph from the rankings: an edge u -> v, weighted
    by the expert's confidence ic, means u is ranked above v; assumes
    ranks are exactly 1..len(ranking) (no ties)."""
    rank_graph = nx.DiGraph()
    for ranking, ic in rankings:
        for node in ranking.values():
            rank_graph.add_node(node)
    for ranking, ic in rankings:
        for rank_up in range(1, len(ranking)):
            for rank_dn in range(rank_up + 1, len(ranking) + 1):
                u = ranking[rank_up]
                v = ranking[rank_dn]
                if u in rank_graph and v in rank_graph[u]:
                    rank_graph.edges[u, v]['weight'] += ic
                else:
                    rank_graph.add_edge(u, v, weight=ic)
    return rank_graph
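To make the expected input format concrete, here is a miniature made-up example: each ranking is a dict mapping rank (1 = best) to item, paired with the expert's confidence ic.

# Two toy rankings of three items (illustrative data only)
toy_rankings = [({1: 'A', 2: 'B', 3: 'C'}, 0.6),
                ({1: 'B', 2: 'A'}, 0.3)]
g = build_graph(toy_rankings)
print(g.edges(data=True))
# A->B, A->C and B->C each get weight 0.6; B->A gets weight 0.3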
def build_markov_chain(rank_graph):
    """Turn the weighted graph into a Markov chain by normalizing the
    outgoing edge weights of each node so that they sum to 1."""
    for node in rank_graph.nodes():
        sum_weights = 0
        for adj_node in rank_graph[node]:
            sum_weights += rank_graph[node][adj_node]['weight']
        for adj_node in rank_graph[node]:
            rank_graph.edges[node, adj_node]['weight'] /= sum_weights
    return rank_graph
def compute_transition_matrix(rank_graph):
    """Extract the transition matrix P of the Markov chain as a DataFrame
    indexed by the graph nodes."""
    nb_nodes = len(rank_graph.nodes())
    transitions = pd.DataFrame(np.zeros((nb_nodes, nb_nodes)),
                               index=rank_graph.nodes(),
                               columns=rank_graph.nodes())
    for node in rank_graph.nodes():
        for adj_node in rank_graph[node]:
            # .at[] replaces the deprecated DataFrame.set_value
            transitions.at[node, adj_node] = rank_graph[node][adj_node]['weight']
    # row-normalize; a node with no outgoing edges yields a zero row,
    # hence NaNs after the division
    return transitions.div(transitions.sum(axis=1), axis=0)
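Continuing the toy example (my own illustrative data), we can inspect the resulting transition matrix. Note the NaN row for 'C', which has no outgoing edges; feeding such a matrix to compute_stationary_distribution gives NaN, which is likely why 'eq' comes out as NaN in the 5-rankings run below.

mc = build_markov_chain(g)  # normalizes outgoing weights in place
P_toy = compute_transition_matrix(mc)
print(P_toy)
# A's outgoing weights (B: 0.6, C: 0.6) normalize to 0.5 each;
# B's (A: 0.3, C: 0.6) normalize to 1/3 and 2/3;
# C has no outgoing edges, so its row is NaN after the division.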
def random_walk(markov_chain, counts, max_random_steps=1000):
    """Walk the chain for at most max_random_steps steps from a uniformly
    drawn start node, incrementing the visit counts along the way."""
    nodes = list(markov_chain.nodes())
    current_node = nodes[np.random.randint(len(nodes))]
    cur_step = 0
    while cur_step < max_random_steps:
        counts[current_node] += 1
        neighbours = [(adj, markov_chain[current_node][adj]['weight'])
                      for adj in markov_chain[current_node]]
        adj_nodes = [adj for adj, w in neighbours]
        adj_weights = [w for adj, w in neighbours]
        if not adj_nodes:
            # node with no outgoing edges: the walk stops here
            return counts
        current_node = np.random.choice(adj_nodes, p=adj_weights)
        cur_step += 1
    return counts
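Still on the toy chain: every walk eventually reaches 'C', which has no outgoing edges, and stops there. This is why the experiment below restarts the walk from a fresh random start node many times.

toy_counts = {node: 0 for node in mc.nodes()}
for _ in range(100):  # restart the walk 100 times
    toy_counts = random_walk(mc, toy_counts, max_random_steps=100)
print(toy_counts)  # accumulated visit counts per node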
nb_assets = 250
nb_observations = 1 * 252
residual_returns = np.random.multivariate_normal(
np.zeros(nb_assets), cov=np.eye(nb_assets), size=nb_observations)
residual_returns = pd.DataFrame(residual_returns)
ranked_cumulative_residual_returns = (residual_returns
.sum()
.rank(ascending=False))
nb_experts_available = [
5, 10, 15, 20, 25, 30, 40, 50,
60, 70, 80, 90, 100, 125, 150,
175, 200, 250, 300, 350, 400,
450, 500,
]
results = {}
for nb_experts in nb_experts_available:
    # simulate the experts' rankings
    prediction_rankings = []
    for expert in range(nb_experts):
        confidence = np.random.randint(1, 61) / 100
        pct_coverage = np.random.randint(2, 51) / 100
        expert_coverage = np.random.choice(nb_assets, replace=False,
                                           size=int(nb_assets * pct_coverage))
        predictions = []
        for node in expert_coverage:
            prediction = (confidence * residual_returns[node] +
                          np.sqrt(1 - confidence**2) * np.random.normal(
                              0, np.std(residual_returns[node]), nb_observations))
            predictions.append(prediction)
        expert_predictions = pd.DataFrame(np.array(predictions).T,
                                          columns=expert_coverage)
        ranked_cumulated_predictions = (
            expert_predictions
            .sum()
            .rank(ascending=False))
        prediction_rankings.append(
            ({value: key for key, value in dict(ranked_cumulated_predictions).items()},
             confidence))

    rank_graph = build_graph(prediction_rankings)
    markov_chain = build_markov_chain(rank_graph)

    # stationary distribution via the linear equation
    transition_matrix = compute_transition_matrix(rank_graph)
    stationary_distribution = compute_stationary_distribution(
        transition_matrix.values)
    eq_ranking = (pd.DataFrame(stationary_distribution,
                               index=transition_matrix.columns)[0]
                  .rank(ascending=True))

    # stationary distribution via random walks
    graph_nodes = list(markov_chain.nodes())
    max_random_steps = 10000
    counts = {node: 0 for node in graph_nodes}
    nb_jumps = 100
    for jump in range(nb_jumps):
        counts = random_walk(markov_chain, counts, max_random_steps)

    # true ranking
    true_ranking = sorted([(key, value)
                           for key, value in dict(ranked_cumulative_residual_returns).items()],
                          key=lambda x: x[1])
    true_ranking = pd.Series([count for node, count in true_ranking],
                             index=[node for node, count in true_ranking])

    # rank graph method
    final_ranking = sorted([(node, count) for node, count in counts.items()],
                           key=lambda x: x[0])
    graph_ranking = (pd.Series([count for node, count in final_ranking],
                               index=[node for node, count in final_ranking])
                     .rank(ascending=True))

    # average ranking (confidence-weighted baseline)
    rankings = pd.DataFrame(np.nan,
                            columns=sorted(graph_nodes),
                            index=range(len(prediction_rankings)))
    for idrank in range(len(prediction_rankings)):
        for rank, asset in prediction_rankings[idrank][0].items():
            rankings.loc[idrank, asset] = rank / len(prediction_rankings[idrank][0])
    sum_weights = sum([prediction_rankings[idrank][1]
                       for idrank in range(len(prediction_rankings))])
    average_rankings = (rankings
                        .multiply([prediction_rankings[idrank][1] / sum_weights
                                   for idrank in range(len(prediction_rankings))],
                                  axis='rows')
                        .sum(axis=0))
    average_ranking = average_rankings.rank(ascending=True)

    results[nb_experts] = {
        'avg': average_ranking.corr(true_ranking, method='spearman'),
        'graph': graph_ranking.corr(true_ranking, method='spearman'),
        'eq': eq_ranking.corr(true_ranking, method='spearman'),
        'corr_graph_eq': eq_ranking.corr(graph_ranking, method='spearman'),
        'corr_graph_avg': graph_ranking.corr(average_ranking, method='spearman')}
    print('number of rankings available:', nb_experts, '\n')
    print(results[nb_experts])
    print('--------------------------------------\n\n')
number of rankings available: 5
{'avg': 0.27853952574854235, 'graph': 0.28547681249477147, 'eq': nan, 'corr_graph_eq': nan, 'corr_graph_avg': 0.8089651551730394}
--------------------------------------
number of rankings available: 10
{'avg': 0.40457682993752564, 'graph': 0.4564647991823201, 'eq': 0.45614883843479664, 'corr_graph_eq': 0.9997431945445487, 'corr_graph_avg': 0.909035679936199}
--------------------------------------
number of rankings available: 15
{'avg': 0.4360940536338904, 'graph': 0.5043499822580958, 'eq': 0.5066599300427517, 'corr_graph_eq': 0.9995993002718595, 'corr_graph_avg': 0.951359483622102}
--------------------------------------
number of rankings available: 20
{'avg': 0.3760376326021216, 'graph': 0.43245960348275875, 'eq': 0.4344073345173522, 'corr_graph_eq': 0.9995691447757145, 'corr_graph_avg': 0.9235504620944209}
--------------------------------------
number of rankings available: 25
{'avg': 0.40580425286804583, 'graph': 0.4905025607470833, 'eq': 0.4922087073393173, 'corr_graph_eq': 0.9993341328350343, 'corr_graph_avg': 0.9260998808430752}
--------------------------------------
number of rankings available: 30
{'avg': 0.41952325637210186, 'graph': 0.4808010220716799, 'eq': 0.4831000816013056, 'corr_graph_eq': 0.9992412032767914, 'corr_graph_avg': 0.8865637299008844}
--------------------------------------
number of rankings available: 40
{'avg': 0.5807643642298276, 'graph': 0.6020478303247851, 'eq': 0.6041219219507512, 'corr_graph_eq': 0.9991974262349177, 'corr_graph_avg': 0.955521285101716}
--------------------------------------
number of rankings available: 50
{'avg': 0.4981892190275044, 'graph': 0.5219731406647061, 'eq': 0.5188886862189795, 'corr_graph_eq': 0.9991169770243907, 'corr_graph_avg': 0.9333489495986153}
--------------------------------------
number of rankings available: 60
{'avg': 0.581746651946431, 'graph': 0.6176381137293365, 'eq': 0.6172203715259443, 'corr_graph_eq': 0.9992853637432219, 'corr_graph_avg': 0.9452567890450728}
--------------------------------------
number of rankings available: 70
{'avg': 0.5855959295348726, 'graph': 0.6105042738036655, 'eq': 0.6146790188643019, 'corr_graph_eq': 0.9988932925939572, 'corr_graph_avg': 0.9541415318270277}
--------------------------------------
number of rankings available: 80
{'avg': 0.594238371813949, 'graph': 0.6124775668114719, 'eq': 0.6090817453079249, 'corr_graph_eq': 0.9986162320008296, 'corr_graph_avg': 0.9516971462361637}
--------------------------------------
number of rankings available: 90
{'avg': 0.6528814861037776, 'graph': 0.6685824747911345, 'eq': 0.6644193347093552, 'corr_graph_eq': 0.998760811221577, 'corr_graph_avg': 0.9500023616398802}
--------------------------------------
number of rankings available: 100
{'avg': 0.7592097153554457, 'graph': 0.7748645320253476, 'eq': 0.778316325061201, 'corr_graph_eq': 0.9986045195300323, 'corr_graph_avg': 0.9689776319928975}
--------------------------------------
number of rankings available: 125
{'avg': 0.7592404358469735, 'graph': 0.7952108234565716, 'eq': 0.7979367349877597, 'corr_graph_eq': 0.9985849357294801, 'corr_graph_avg': 0.9670326505446638}
--------------------------------------
number of rankings available: 150
{'avg': 0.8177975327605241, 'graph': 0.8338240303172642, 'eq': 0.8357805404886478, 'corr_graph_eq': 0.9987750175805727, 'corr_graph_avg': 0.973077267208882}
--------------------------------------
number of rankings available: 175
{'avg': 0.8296986511784188, 'graph': 0.8322850589517754, 'eq': 0.8335678970863534, 'corr_graph_eq': 0.9980623020217055, 'corr_graph_avg': 0.9736928506789779}
--------------------------------------
number of rankings available: 200
{'avg': 0.8443116209859357, 'graph': 0.855238237870799, 'eq': 0.8576374021984351, 'corr_graph_eq': 0.9984816367927717, 'corr_graph_avg': 0.9769219544432244}
--------------------------------------
number of rankings available: 250
{'avg': 0.8281987231795707, 'graph': 0.8472580281915887, 'eq': 0.8489627034032543, 'corr_graph_eq': 0.9978931466511936, 'corr_graph_avg': 0.9700315327204603}
--------------------------------------
number of rankings available: 300
{'avg': 0.8552066433062929, 'graph': 0.8782567623628004, 'eq': 0.8796125378006048, 'corr_graph_eq': 0.9978668417756439, 'corr_graph_avg': 0.9775797582310694}
--------------------------------------
number of rankings available: 350
{'avg': 0.8873794460711371, 'graph': 0.8973040454295462, 'eq': 0.8992406278500454, 'corr_graph_eq': 0.9977282187792099, 'corr_graph_avg': 0.9777188029449393}
--------------------------------------
number of rankings available: 400
{'avg': 0.8717043152690442, 'graph': 0.8921848632542332, 'eq': 0.8941863389814236, 'corr_graph_eq': 0.9979787793938781, 'corr_graph_avg': 0.9776978360653875}
--------------------------------------
number of rankings available: 450
{'avg': 0.9056074497191954, 'graph': 0.9119969678836083, 'eq': 0.9122791724667596, 'corr_graph_eq': 0.9981012783409526, 'corr_graph_avg': 0.9862853888688462}
--------------------------------------
number of rankings available: 500
{'avg': 0.9154426150818412, 'graph': 0.9246062443715376, 'eq': 0.9244160706571304, 'corr_graph_eq': 0.9979830050181402, 'corr_graph_avg': 0.9819553350342176}
--------------------------------------
plt.figure(figsize=(20, 10))
plt.plot(nb_experts_available,
[results[nb_experts]['avg'] for nb_experts in nb_experts_available],
'--o', label='average')
plt.plot(nb_experts_available,
[results[nb_experts]['graph'] for nb_experts in nb_experts_available],
'--o', label='network-based')
plt.plot(nb_experts_available,
[results[nb_experts]['eq'] for nb_experts in nb_experts_available],
'--o', label='stat. distr.')
plt.title('Correlation to the true ranking', fontsize=24)
plt.legend(fontsize=24)
plt.xlabel('Number of rankings available', fontsize=16)
plt.ylabel('Correlation', fontsize=16)
plt.show()
plt.figure(figsize=(20, 10))
plt.plot(nb_experts_available,
[results[nb_experts]['corr_graph_avg'] for nb_experts in nb_experts_available],
'--o', label='corr(graph, average)')
plt.plot(nb_experts_available,
[results[nb_experts]['corr_graph_eq'] for nb_experts in nb_experts_available],
'--o', label='corr(graph, equation)')
plt.title('Correlation between methods', fontsize=24)
plt.legend(fontsize=24)
plt.xlabel('Number of rankings available', fontsize=16)
plt.ylabel('Correlation', fontsize=16)
plt.show()