Image("./InterTubes-Dataset/conduit_fiber_cable.png")
Evaluate state of current sharing model. What's known? What could it be better?
Develop new (near) optimal sharing matrix
Make recommendations (useful for Telecom and City planners)
Dataset is made available through PREDICT : https://www.predict.org/
Image("./InterTubes-Dataset/long-haul.jpg", width=750)
sharing_url = "https://s3-us-west-2.amazonaws.com/telecoms-analytics-dataset/sharing.txt"
shared_conduit = pd.read_csv(sharing_url, delim_whitespace=True, index_col=0, header=None)
shared_conduit.index.name = 'ISP Names'
# Initialize first column to ISP and set other columns to Sn => "Number of shared conduits"
col_names = []
for k in np.arange(1,shared_conduit.columns.size+1):
col_names.append('S'+str(k))
# Set the columns names and display
shared_conduit.columns = col_names
shared_conduit.head(3)
S1 | S2 | S3 | S4 | S5 | S6 | S7 | S8 | S9 | S10 | ... | S533 | S534 | S535 | S536 | S537 | S538 | S539 | S540 | S541 | S542 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ISP Names | |||||||||||||||||||||
ATT | 0 | 0 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Verizon | 0 | 0 | 0 | 10 | 0 | 18 | 11 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
DeutscheTelekom | 0 | 0 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 542 columns
shared_conduit.head(10)
Conduits | S1 | S2 | S3 | S4 | S5 | S6 | S7 | S8 | S9 | S10 | ... | S533 | S534 | S535 | S536 | S537 | S538 | S539 | S540 | S541 | S542 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ISP Names | |||||||||||||||||||||
ATT | 0 | 0 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Verizon | 0 | 0 | 0 | 10 | 0 | 18 | 11 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
DeutscheTelekom | 0 | 0 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
XO | 0 | 0 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
NTT | 0 | 12 | 9 | 10 | 0 | 18 | 11 | 0 | 9 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
TeliaSonera | 0 | 0 | 0 | 10 | 0 | 18 | 11 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Sprint | 0 | 0 | 0 | 10 | 3 | 18 | 11 | 0 | 9 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Tata | 0 | 12 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
CenturyLink | 0 | 12 | 9 | 0 | 0 | 18 | 0 | 0 | 9 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Cogent | 0 | 12 | 0 | 0 | 0 | 18 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 rows × 542 columns
sum_across_conduit = shared_conduit.sum(axis=1)
sum_across_conduit.sort_values(ascending=False, inplace=True)
sum_across_conduit = pd.DataFrame(sum_across_conduit)
sum_across_conduit.columns = ['TOTAL CONDUIT']
sum_across_conduit.head()
TOTAL CONDUIT | |
---|---|
ISP Names | |
EarthLink | 2958 |
Level3 | 2914 |
Comcast | 1888 |
CenturyLink | 1843 |
TWC | 1773 |
f, ax = plt.subplots(1,1, figsize=(15,10));
sns.barplot(data=sum_across_conduit, y=sum_across_conduit.index, x='TOTAL CONDUIT', palette='BrBG_d')
plt.xlabel("TOTAL CONDUITS used by each ISP")
<matplotlib.text.Text at 0x126bd5198>
plt.figure(figsize=(25,10)); plt.ylim(-1,20); plt.ylabel("Number of Shared Conduits")
sns.boxplot(data=trans_shared_conduit, palette='BrBG', whis=20)
print("Considering the variability, whisker length, etc, we know distribution cannot be Normal.")
Considering the variability, whisker length, etc, we know distribution cannot be Normal.
Usage Patterns
of the top 8 ISP?¶trans_shared_conduit = shared_conduit.transpose().copy()
plt.figure(figsize=(15,7)); plt.xlim(-0.5,20)
top_x = sum_across_conduit.index.values[:8]
ax = sns.boxplot(data=trans_shared_conduit[top_x], orient='h', palette='Set1')
ax = sns.swarmplot(data=trans_shared_conduit[top_x], orient='h', palette='Set1')
print("Usage Pattern: (Earthlink & Level3) ::::: (Comcast & CenturyLink) ::::: (TWC & ATT & Verizon)")
Usage Pattern: (Earthlink & Level3) ::::: (Comcast & CenturyLink) ::::: (TWC & ATT & Verizon)
top10_ISP_2015 = ['Comcast', 'TWC', 'Verizon', 'ATT', 'Cox',
'Cogent', 'SuddenLink', 'CableOne', 'CenturyLink', 'Level3']
for isp in sum_across_conduit.index:
try:
if isp in top10_ISP_2015:
sum_across_conduit.loc[isp, 'TOP10_ISP_2015'] = 'TOP 10'
else:
sum_across_conduit.loc[isp, 'TOP10_ISP_2015'] = 'OTHERS'
except KeyError:
pass
sum_across_conduit.head()
TOTAL CONDUIT | TOP10_ISP_2015 | |
---|---|---|
ISP Names | ||
EarthLink | 2958 | OTHERS |
Level3 | 2914 | TOP 10 |
Comcast | 1888 | TOP 10 |
CenturyLink | 1843 | TOP 10 |
TWC | 1773 | TOP 10 |
f, axes = plt.subplots(1,2, figsize=(20, 7));
ax1 = sns.boxplot(data=sum_across_conduit, y='TOTAL CONDUIT', palette='Reds_d', ax=axes[0])
ax2 = sns.countplot(data=sum_across_conduit, x='TOP10_ISP_2015', palette='Set2', ax=axes[1])
ax1.set_xlabel('ALL ISPs'); ax2.set_xlabel('ALL ISPs'); ax2.set_ylabel('COUNT')
<matplotlib.text.Text at 0x12f116d30>
trans_shared_conduit[top_x].head()
ISP Names | EarthLink | Level3 | Comcast | CenturyLink | TWC | Verizon | ATT | NTT |
---|---|---|---|---|---|---|---|---|
Conduits | ||||||||
S1 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
S2 | 12 | 12 | 12 | 12 | 12 | 0 | 0 | 12 |
S3 | 9 | 9 | 9 | 9 | 9 | 0 | 0 | 9 |
S4 | 10 | 10 | 0 | 0 | 10 | 10 | 0 | 10 |
S5 | 3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
ISP
share each conduit?¶conduit_counts = [len(shared_conduit[conduit].nonzero()[0]) for conduit in shared_conduit.columns]
df_conduit_counts = pd.DataFrame(conduit_counts, index=shared_conduit.columns)
df_conduit_counts.columns = ['ISP in Conduits']
df_conduit_counts.index.name = 'Conduits'
df = df_conduit_counts['ISP in Conduits'];
# Compute the 1st, 2nd and 3rd Quantiles
first, secnd, third = df.quantile([.25, .5, .75])
df_conduit_counts.head(3)
ISP in Conduits | |
---|---|
Conduits | |
S1 | 2 |
S2 | 12 |
S3 | 9 |
f, axes = plt.subplots(1,2, figsize=(20, 7)); df2 = df_conduit_counts.copy()
ax = sns.boxplot(x='ISP in Conduits', data=df2, palette='Reds_d', ax=axes[1], whis=19)
ax = sns.countplot(x='ISP in Conduits', data=df2, palette='Reds_d', ax=axes[0])
print("\nMin. : {} \t Max. : {} \t Median : {} \t Q1 : {} \t Q2 : {} \t Q3 : {} "
.format(df.min(), df.max(), np.median(df), first, secnd, third))
Min. : 1 Max. : 19 Median : 4.0 Q1 : 2.0 Q2 : 4.0 Q3 : 8.0
Location
(city;state
)¶# Extract dataset into dataframe and verify that there are 273 nodes
column_names = ['Nodes_City_State']
node_url = "https://s3-us-west-2.amazonaws.com/telecoms-analytics-dataset/nodes.txt"
node_city_state = pd.read_csv(node_url, sep='\t', header=None, index_col=0, names=column_names)
node_city_state.index.name = 'Location_Index'
print(node_city_state.shape)
node_city_state.head(3)
(273, 1)
Nodes_City_State | |
---|---|
Location_Index | |
1 | Abilene;Texas |
2 | Akron;Ohio |
3 | Albany;New_York |
Fiber Optics
conduit dataset¶column_names = ['Source', 'Sink']
# edges_url = "./InterTubes-Dataset/links.txt"
edges_url = "https://s3-us-west-2.amazonaws.com/telecoms-analytics-dataset/links.txt"
edges_src_snk = pd.read_csv(edges_url, names=column_names, header=None)
print("\nNumber of links (edges or conduit) : ", edges_src_snk.index.size)
print(edges_src_snk.shape)
edges_src_snk.head()
Number of links (edges or conduit) : 542 (542, 2)
Source | Sink | |
---|---|---|
0 | 105 | 21 |
1 | 105 | 223 |
2 | 105 | 230 |
3 | 223 | 21 |
4 | 223 | 111 |
Names
for Index
in both Source
and Sink
¶on_Source = pd.merge(edges_src_snk, node_city_state, left_on=['Source'], right_index=True)
on_Source.head(2)
Source | Sink | Nodes_City_State | |
---|---|---|---|
0 | 105 | 21 | Harbour_Pointe;Washington |
1 | 105 | 223 | Harbour_Pointe;Washington |
# Merge the Source, Sink with the corresponding codes in both dataframes
on_Source = pd.merge(edges_src_snk, node_city_state, left_on=['Source'], right_index=True)
source_sink = pd.merge(on_Source, node_city_state, left_on=['Sink'], right_index=True)
# Rename the resulting columns and sort the index 'inplace'
column_names = ['Source_Index', 'Sink_Index', 'Outgoing', 'Incoming']
source_sink.columns = column_names
source_sink.sort_index(inplace=True)
# Get the frequency of Incoming and Outgoing sites and merge into a single Dataframe
all_incoming_outgoing = pd.merge(pd.DataFrame(source_sink['Incoming'].value_counts()),
pd.DataFrame(source_sink['Outgoing'].value_counts()), left_index=True, right_index=True)
all_incoming_outgoing.head()
Incoming | Outgoing | |
---|---|---|
Denver;Colorado | 5 | 12 |
Salt_Lake_City;Utah | 2 | 11 |
Dallas;Texas | 4 | 11 |
Phoenix;Arizona | 4 | 8 |
Atlanta;Georgia | 7 | 8 |
LOCATIONS
with HIGHEST INCOMING and OUTGOING fiber Links?¶# Sort the dataframe on Outgoing connections
highest_incoming_loc = all_incoming_outgoing.sort_values('Incoming', ascending=False).head(5)
highest_outgoing_loc = all_incoming_outgoing.sort_values(['Outgoing'], ascending=False).head(5)
# highest_outgoing_loc = highest_outgoing_loc.reindex(columns=['Outgoing', 'Incoming'])
highest_outgoing_loc.head()
Incoming | Outgoing | |
---|---|---|
Denver;Colorado | 5 | 12 |
Dallas;Texas | 4 | 11 |
Salt_Lake_City;Utah | 2 | 11 |
Phoenix;Arizona | 4 | 8 |
Atlanta;Georgia | 7 | 8 |
f, axes = plt.subplots(1,2, figsize=(20, 7));
highest_incoming_loc.plot(kind='bar', stacked=True, ax=axes[0], title='CITIES with HIGHEST INCOMING')
highest_outgoing_loc.plot(kind='bar', stacked=True, ax=axes[1], title='CITIES with HIGHEST OUTGOING')
<matplotlib.axes._subplots.AxesSubplot at 0x110cbfe48>
LOCATIONS
with LOWEST INCOMING & OUTGOING Connections?¶lowest_outgoing_loc = all_incoming_outgoing.sort_values(['Outgoing']).head(5)
lowest_incoming_loc = all_incoming_outgoing.sort_values('Incoming').head(5)
# lowest_outgoing_loc = lowest_outgoing_loc.reindex(columns=['Outgoing', 'Incoming'])
lowest_outgoing_loc.head()
Incoming | Outgoing | |
---|---|---|
Lansing;Michigan | 2 | 1 |
Fredericksburg;Virginia | 1 | 1 |
Gasontia;North_Carolina | 1 | 1 |
Melbourne;Florida | 2 | 1 |
Gainesville;Florida | 2 | 1 |
f, axes = plt.subplots(1,2, figsize=(20, 7));
lowest_incoming_loc.plot(kind='bar', stacked=True, ax=axes[0])
lowest_outgoing_loc.plot(kind='bar', stacked=True, ax=axes[1])
# title='Top 5 Locations with LOWEST INCOMING Fiber-optics Connections'
<matplotlib.axes._subplots.AxesSubplot at 0x12d2eb278>
States
have multiple
installations in the associated Cities
?¶Cities
and States
are not explicitly separated in our data. But we can quickly check if multiple States
appear in the Location
names.
source_sink.groupby(['Incoming']).size().head(7)
# We see here that not only does each city have multiple fiber locations, but some states also have
# installations in multiple cities. We need to separate the locations by Cities and States.
Incoming Abilene;Texas 1 Akron;Ohio 3 Albany;New_York 3 Albuquerque;New_Mexico 3 Allentown;Pennsylvania 2 Altoona;Pennsylvania 1 Amarillo;Texas 2 dtype: int64
Cities
and States
into Columns¶source_sink['Incoming_City'] = ''; source_sink['Outgoing_City'] = '';
source_sink['Incoming_State'] = ''; source_sink['Outgoing_State'] = '';
for ndx in source_sink.index:
# Split both incoming and outgoing on colon to City and State
incoming = source_sink.Incoming[ndx].split(';')
outgoing = source_sink.Outgoing[ndx].split(';')
# Populate the Incoming Cities
source_sink['Incoming_City'].values[ndx] = incoming[0]
source_sink['Outgoing_City'].values[ndx] = outgoing[0]
# If the State is mising, use the City e.g. Washington DC
source_sink['Incoming_State'].values[ndx] = incoming[1] if len(incoming) > 1 else incoming[0]
source_sink['Outgoing_State'].values[ndx] = outgoing[1] if len(outgoing) > 1 else outgoing[0]
# Form new dataframe
new_source_sink = source_sink.drop(['Source_Index', 'Sink_Index', 'Incoming', 'Outgoing'], axis=1)
new_source_sink.head()
Incoming_City | Outgoing_City | Incoming_State | Outgoing_State | |
---|---|---|---|---|
0 | Bellevue | Harbour_Pointe | Washington | Washington |
1 | Seattle | Harbour_Pointe | Washington | Washington |
2 | Spokane | Harbour_Pointe | Washington | Washington |
3 | Bellevue | Seattle | Washington | Washington |
4 | Hillsboro | Seattle | Oregon | Washington |
Utilization Difference
for the CITIES
?¶city_in_out_diff = pd.merge(pd.DataFrame(new_source_sink['Incoming_City'].value_counts()),
pd.DataFrame(new_source_sink['Outgoing_City'].value_counts()),
left_index=True, right_index=True)
city_in_out_diff['City_Difference'] = ((city_in_out_diff.Incoming_City -
city_in_out_diff.Outgoing_City)/city_in_out_diff.Incoming_City) * 100
city_in_out_diff = city_in_out_diff.sort_values('City_Difference', ascending=False)
top5_low5 = [city_in_out_diff.head(), city_in_out_diff.tail()]
top5_low5 = pd.concat(top5_low5)
top5_low5
Incoming_City | Outgoing_City | City_Difference | |
---|---|---|---|
Orlando | 8 | 1 | 87.5 |
Troy | 5 | 1 | 80.0 |
San_Francisco | 5 | 1 | 80.0 |
Wilmington | 4 | 1 | 75.0 |
Lubbock | 4 | 1 | 75.0 |
Des_Moines | 1 | 5 | -400.0 |
Orange_County | 1 | 5 | -400.0 |
San_Diego | 1 | 5 | -400.0 |
Salt_Lake_City | 2 | 11 | -450.0 |
Minneapolis | 1 | 7 | -600.0 |
ax = plt.figure(figsize=(13,7))
sns.barplot(x='City_Difference', y=top5_low5.index.values, data=top5_low5, label='City_Difference')
plt.xlabel('Average Link Utilization Difference (%)')
plt.title('Incoming and Outgoing Links DIFFERENCES (in %) for few locations')
<matplotlib.text.Text at 0x127bb5390>
Utilization Difference
for the STATES
?¶state_in_out_diff = pd.merge(pd.DataFrame(new_source_sink['Incoming_State'].value_counts()),
pd.DataFrame(new_source_sink['Outgoing_State'].value_counts()),
left_index=True, right_index=True)
state_in_out_diff['State_Difference'] = ((state_in_out_diff.Incoming_State -
state_in_out_diff.Outgoing_State)/state_in_out_diff.Incoming_State) * 100
state_in_out_diff = state_in_out_diff.sort_values('State_Difference', ascending=False)
print("Total States: ", state_in_out_diff.index.size)
state_in_out_diff;
Total States: 42
ax = plt.figure(figsize=(20,16)); df = state_in_out_diff;
sns.barplot(x='State_Difference', y=df.index[:38], data=df.head(38), palette='GnBu_d')
plt.xlabel('Average Link Utilization Difference by STATE(%)')
plt.title('Incoming and Outgoing Link DIFFERENCES (in %) for different States')
<matplotlib.text.Text at 0x1327a14a8>
CITIES
¶city_in_out_diff['City_Difference'] = city_in_out_diff.Incoming_City - city_in_out_diff.Outgoing_City
px = sns.pairplot(city_in_out_diff, diag_kind="kde", kind="reg", diag_kws=dict(shade=True), size=4)
px.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/city_corr_diff")
p1 = sns.jointplot("Incoming_City", "Outgoing_City", data=city_in_out_diff, color='darkorange', kind='reg', size=7)
p2 = sns.jointplot("Incoming_City","City_Difference", data=city_in_out_diff, color='darkred', kind='reg', size=7)
p3 = sns.jointplot("Outgoing_City","City_Difference", data=city_in_out_diff, color='darkblue', kind='kde', size=7)
p1.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_out_vs_in")
p2.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_in_vs_diff")
p3.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_out_vs_diff")
STATES
¶state_in_out_diff['State_Difference'] = state_in_out_diff.Incoming_State - state_in_out_diff.Outgoing_State
state_in_out_diff = state_in_out_diff.sort_values('State_Difference', ascending=True)
# Plot regression line and a matrix of correlation plots for the data
pw = sns.pairplot(state_in_out_diff, diag_kind="kde", kind="reg", diag_kws=dict(shade=True), size=4)
pw.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_corr_diff")
Pearson r
is a measure of the linear correlation or dependency between two variables. When Linear equation describes the relationship between X and Y perfectly, Pearson is = 1.
p1 = sns.jointplot("Incoming_State", "Outgoing_State", data=state_in_out_diff, kind='reg', color='darkblue', size=7)
p2 = sns.jointplot("Incoming_State","State_Difference", data=state_in_out_diff, kind='reg', color='darkred', size=7)
p3 = sns.jointplot("Outgoing_State","State_Difference", data=state_in_out_diff, kind='reg', color='darkgreen', size=7)
p1.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_out_vs_in")
p2.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_in_vs_diff")
p3.savefig("/Users/RichardAfolabi/myGitHub/turaquo/static/img/state_out_vs_diff")
We need to encode the US States
using Postal Code
before we can overlay on an interactive map. PostalCode is not provided in data.
http://www.infoplease.com/ipa/A0110468.html contains list of US States
and relevant PostalCode
. We go ahead and scrape
the page. Clean
the scraped data and merge
with our existing dataframe.
# Scrape web data for US States Abbrevations and Postal Codes. http://www.infoplease.com/ipa/A0110468.html
def postalcode_scraper():
""" Scrapes a known URL and returns a Pandas dataframe containing
list of US States as index and their Postal Codes as column values.
"""
# URL to scrape
data_url = "http://www.infoplease.com/ipa/A0110468.html"
# Scrape the page with Pandas
table_scrape = pd.read_html(data_url)[1]
# Subtitute spaces in State names with '_' to confirm with existing dataset
table_scrape[0] = [tab.replace(' ','_') for tab in table_scrape[0] ]
# Reindex using corrected state names to confirm to existing dataframe formats
table_scrape.index = table_scrape[0]
# Remove extranous data elements and unneeded columns
table_scrape.drop([0,1], axis=1, inplace=True)
table_scrape.drop('State', inplace=True)
# Set column and index names.
table_scrape.index.name = 'State'
table_scrape.columns = ['PostalCode']
return table_scrape
postalcode_table = postalcode_scraper()
postalcode_table.head()
PostalCode | |
---|---|
State | |
Alabama | AL |
Alaska | AK |
American_Samoa | AS |
Arizona | AZ |
Arkansas | AR |
state_map = state_in_out_diff[['Incoming_State', 'Outgoing_State']].copy()
state_map['State_Difference'] = state_map.Incoming_State - state_map.Outgoing_State
state_map = pd.merge(state_map, postalcode_table, left_index=True, right_index=True, how='left')
# check those state with missing Postal code and encode them manually.
# print(state_map[state_map.PostalCode.isnull()])
# Since Washington DC has no state, we can directly encode as 'DC
state_map.ix['Washington_DC'] = state_map.ix['Washington_DC'].fillna('DC')
# # Then check again...
# print(state_map.ix['Washington_DC'])
state_map['US_States'] = state_map.index
state_map.index = state_map.PostalCode
state_map.drop('PostalCode', axis=1, inplace=True)
state_map.head(3)
Incoming_State | Outgoing_State | State_Difference | US_States | |
---|---|---|---|---|
PostalCode | ||||
UT | 5 | 13 | -8 | Utah |
WA | 8 | 15 | -7 | Washington |
AL | 1 | 6 | -5 | Alabama |
def find_states_missing(df):
""" Returns dataframe of states missing from our dataset """
states_missing = []
postcodes = postalcode_scraper()
for st in postcodes.PostalCode:
if st not in df.index:
d = postcodes.loc[postcodes['PostalCode'] == st]
states_missing.append(d)
states_missing = pd.concat(states_missing)
states_missing['US_States'] = states_missing.index
states_missing.index = states_missing.PostalCode.values
states_missing.drop('PostalCode', axis=1, inplace=True)
return states_missing
# Call the functions and create relevant dataset
missing_states = find_states_missing(state_map)
missing_states.head()
US_States | |
---|---|
AK | Alaska |
AS | American_Samoa |
DE | Delaware |
GU | Guam |
HI | Hawaii |
overlay
the exploratory data on the US Maps¶import jinja2
from collections import OrderedDict
from bokeh.palettes import brewer
from bokeh.models import HoverTool
from bokeh.embed import components
from bokeh.plotting import figure, ColumnDataSource
from bokeh.io import output_notebook, hplot, vplot, show
from bokeh.charts import BoxPlot, Histogram, Bar, vplot, hplot
from bokeh.sampledata import us_states, us_cities
output_notebook()
def assign_color(code):
""" Assign specific color contrast to state based on link difference. """
link_diff = state_map.State_Difference.ix[code]
if link_diff < 0: # Outgoing more than incoming links
colr = "#ED8466" # Negative => Light Red
else: # incoming link > outgoing, Link diff > 0.
colr = colors[min(link_diff, len(colors))] # Assign one of the color contrasts
return colr
state_names = []
state_colors = []
state_incoming_link = []
state_outgoing_link = []
usa_states = us_states.data.copy()
del usa_states['AK'];
del usa_states['HI'];
state_map = state_map.sort_values('State_Difference', ascending=False)
missing_states = find_states_missing(state_map)
us_state_xs = [usa_states[code]["lons"] for code in usa_states]
us_state_ys = [usa_states[code]["lats"] for code in usa_states]
colors = ["#D1E2F2", "#ADCCE5", "#77B0D4", "#448BC0", "#449CC0",
"#2B62B2","#2264AB", "#0D408E", "#294F70", "#273A48"]
# Iterate over US Long/Lat list.
for code in usa_states:
# If code exist for infrastructure state
if code in state_map.index.values:
state_colors.append(assign_color(code))
state_names.append(state_map.US_States.ix[code])
state_incoming_link.append(state_map.Incoming_State.ix[code])
state_outgoing_link.append(state_map.Outgoing_State.ix[code])
else:
# No link in State
state_colors.append('#979383') # Shade of gray
state_names.append(missing_states.US_States.ix[code])
state_incoming_link.append(0)
state_outgoing_link.append(0)
source = ColumnDataSource(
data = dict(
x=us_state_xs,
y=us_state_ys,
color=state_colors,
name=state_names,
incoming=state_incoming_link,
outgoing=state_outgoing_link
))
TOOLS="pan,wheel_zoom,box_zoom,reset,hover,save"
plot = figure(tools=TOOLS, plot_width=900, plot_height=600, toolbar_location='left')
plot.title=('Fiber-Optics Coverage within United States')
plot.patches('x', 'y', fill_color='color', line_color="#333333", line_width=0.5, source=source)
# Configure the tooltips
hover = plot.select(dict(type=HoverTool))
hover.point_policy = "follow_mouse"
hover.tooltips = OrderedDict([
("Name ", "@name"),
("Incoming Fiber Links ", " @incoming"),
("Outgoing Fiber Links ", " @outgoing")
])
# show(plot)
show(plot)
# Image("../turaquo/static/img/fiber-optix-links.png")
<bokeh.io._CommsHandle at 0x10c919240>
Bokeh
, Flask
& Jinja2
¶# Create skeletal jinja2 HTML template
template = jinja2.Template("""
<!DOCTYPE html>
<html lang="en-US">
<link
href="http://cdn.pydata.org/bokeh/release/bokeh-0.9.0.min.css"
rel="stylesheet" type="text/css"
>
<script
src="http://cdn.pydata.org/bokeh/release/bokeh-0.9.0.min.js"
></script>
<body>
<h1>Bokeh Chart!</h1>
{{ script }}
{{ div }}
</body>
</html>
""")
TOOLS = "resize,pan,wheel_zoom,box_zoom,reset,previewsave"
mycolrs = brewer["Spectral"][10]
bar = Bar(top5_low5, top5_low5.index, values='City_Difference', agg='mean', width=700, tools=TOOLS, palette=mycolrs )
script, div = components(bar)
show(bar)
# HTML(template.render(script=script, div=div))
<bokeh.io._CommsHandle at 0x140ed97b8>
BOKEH + FLASK
WEB APPLICATION¶