import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

# IPython shell escape: clone the repository that provides the QS ranking
# workbooks read below from /content/ds/data
!git clone https://github.com/weike2001/ds

fatal: destination path 'ds' already exists and is not an empty directory.

import pandas as pd

# Excel workbooks shipped with the cloned repository, one per ranking year
file_path_2022, file_path_2023, file_path_2024 = (
    '/content/ds/data/2022_QS_World_University_Rankings_Results_public_version.xlsx',
    '/content/ds/data/2023 QS World University Rankings V2.1 (For qs.com).xlsx',
    '/content/ds/data/2024 QS World University Rankings 1.2 (For qs.com).xlsx',
)

# Load each workbook into its own DataFrame (read in year order)
df_2022, df_2023, df_2024 = map(
    pd.read_excel, (file_path_2022, file_path_2023, file_path_2024)
)

# The CSV copies live next to the workbooks: same path, '.csv' extension
csv_file_path_2022, csv_file_path_2023, csv_file_path_2024 = (
    xlsx_path.replace('.xlsx', '.csv')
    for xlsx_path in (file_path_2022, file_path_2023, file_path_2024)
)

# Write every DataFrame to its CSV path, leaving out the index column
for _frame, _csv_path in zip(
    (df_2022, df_2023, df_2024),
    (csv_file_path_2022, csv_file_path_2023, csv_file_path_2024),
):
    _frame.to_csv(_csv_path, index=False)

import pandas as pd

# Column names to assign when the CSV exports are re-read. Each list is
# positional: its order must match the column order of that year's file.

# 2022 layout: starts with national/regional ranks and has six scored metrics.
specific_column_names_2022 = [
    'National Rank', 'Regional Rank', '2022 Rank', '2021 Rank', 'Institution Name',
    'Location Code', 'Country/Territory', 'Size', 'Focus', 'Research Intensity',
    'Age Band', 'Status', 'Academic Reputation Score', 'Academic Reputation Rank',
    'Employer Reputation Score', 'Employer Reputation Rank', 'Faculty Student Score',
    'Faculty Student Rank', 'Citations per Faculty Score', 'Citations per Faculty Rank',
    'International Faculty Score', 'International Faculty Rank', 'International Students Score',
    'International Students Rank', 'Overall Score'
]

# 2023 layout: no national/regional rank columns; adds the International
# Research Network and Employment Outcomes metrics.
specific_column_names_2023 = [
    '2023 Rank', '2022 Rank', 'Institution Name', 'Location Code', 'Country/Territory',
    'Size', 'Focus', 'Research Intensity', 'Age Band', 'Status',
    'Academic Reputation Score', 'Academic Reputation Rank',
    'Employer Reputation Score', 'Employer Reputation Rank',
    'Faculty Student Score', 'Faculty Student Rank',
    'Citations per Faculty Score', 'Citations per Faculty Rank',
    'International Faculty Score', 'International Faculty Rank',
    'International Students Score', 'International Students Rank',
    'International Research Network Score', 'International Research Network Rank',
    'Employment Outcomes Score', 'Employment Outcomes Rank',
    'Overall Score'
]

# 2024 layout: like 2023 but without 'Age Band' and with the added
# Sustainability metric.
specific_column_names_2024 = [
    '2024 Rank', '2023 Rank', 'Institution Name', 'Location Code', 'Country/Territory',
    'Size', 'Focus', 'Research Intensity', 'Status',
    'Academic Reputation Score', 'Academic Reputation Rank',
    'Employer Reputation Score', 'Employer Reputation Rank',
    'Faculty Student Score', 'Faculty Student Rank',
    'Citations per Faculty Score', 'Citations per Faculty Rank',
    'International Faculty Score', 'International Faculty Rank',
    'International Students Score', 'International Students Rank',
    'International Research Network Score', 'International Research Network Rank',
    'Employment Outcomes Score', 'Employment Outcomes Rank',
    'Sustainability Score', 'Sustainability Rank',
    'Overall Score'
]

# Sanity check: how many column names are defined for the 2024 file
print(len(specific_column_names_2024))
# Re-read the CSV exports, discarding the first four lines of each file and
# assigning the explicit column names defined above.
# NOTE(review): presumably the four skipped lines are the exported header row
# plus banner/title rows carried over from the workbook - confirm against the raw files.
df_2022 = pd.read_csv(csv_file_path_2022, skiprows = 4, names=specific_column_names_2022)
df_2023 = pd.read_csv(csv_file_path_2023, skiprows = 4, names=specific_column_names_2023)
df_2024 = pd.read_csv(csv_file_path_2024, skiprows = 4, names=specific_column_names_2024)

# Preview the first rows of the 2022 data (notebook cell output)
df_2022.head()

28

# Preview the first rows of the 2023 data (notebook cell output)
df_2023.head()

# Preview the first rows of the 2024 data (notebook cell output)
df_2024.head()

import pandas as pd
import numpy as np

# A bare hyphen marks a missing overall score: swap it for NaN, then make the
# column numeric (anything else that cannot be parsed also becomes NaN).
for _yearly_frame in (df_2022, df_2023, df_2024):
    _scores = _yearly_frame['Overall Score'].replace('-', np.nan)
    _yearly_frame['Overall Score'] = pd.to_numeric(_scores, errors='coerce')

# From here on 'Overall Score' is a float column with NaN where a hyphen was.

import pandas as pd

# Summary statistics of the numeric columns for each ranking year
# (each expression is displayed as its own notebook cell output)
df_2022.describe()

df_2023.describe()

df_2024.describe()

import matplotlib.pyplot as plt
import seaborn as sns

# Metric column -> weight used in the overall score, as a fraction (sums to 1.0).
# Only the six metrics present in all three yearly files are listed.
# NOTE(review): these look like the older QS weighting; the 2023/2024 files carry
# extra metrics (research network, employment outcomes, sustainability), so
# confirm the percentages are still right when they are shown for those years.
qs_metrics_weights = {
    'Academic Reputation Score': {"weight": 0.40},
    'Employer Reputation Score': {"weight": 0.10},
    'Faculty Student Score': {"weight": 0.20},
    'Citations per Faculty Score': {"weight": 0.20},
    'International Faculty Score': {"weight": 0.05},
    'International Students Score': {"weight": 0.05},
}

def create_grid_layout_without_definitions(df, metrics_info, year):
    """Plot one histogram (with KDE curve) per metric in a grid of subplots.

    Args:
        df: DataFrame containing one column per key of ``metrics_info``.
        metrics_info: Mapping of column name -> dict with a ``'weight'`` entry
            (a fraction such as 0.40) that is shown as a percentage in the
            subplot title.
        year: Label inserted into the figure title.

    The grid is three columns wide and gets as many rows as needed, so no
    metric is silently dropped when more than six are passed; grid cells
    without a metric are hidden.

    Raises:
        KeyError: if a metric column is missing from ``df`` or an entry has
            no ``'weight'`` key.
    """
    n_cols = 3
    n_metrics = len(metrics_info)
    # Ceiling division without importing math; keep at least one row.
    n_rows = max(1, -(-n_metrics // n_cols))

    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
    axes = axes.ravel()
    palette = sns.color_palette("coolwarm", n_metrics)

    # Plot each metric in its own grid cell
    for ax, (metric, info), color in zip(axes, metrics_info.items(), palette):
        weight = info['weight']
        sns.histplot(df[metric], kde=True, ax=ax, color=color, alpha=0.7, linewidth=0.5)
        # ':g' renders 40% / 12.5% instead of 40.0% or float artefacts
        # such as 7.000000000000001%.
        ax.set_title(f"{metric} ({weight * 100:g}%)", fontsize=10)
        ax.set_xlabel('Score', fontsize=9)

    # Hide grid cells that did not receive a metric
    for unused_ax in axes[n_metrics:]:
        unused_ax.set_visible(False)

    # Main title; the rect leaves room for it above the subplots
    fig.suptitle(f'Distribution of QS Ranking Metrics for {year}', fontsize=16)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Plot the metric distributions for each ranking year (2022, 2023, 2024)
create_grid_layout_without_definitions(df_2022, qs_metrics_weights, '2022')
create_grid_layout_without_definitions(df_2023, qs_metrics_weights, '2023')
create_grid_layout_without_definitions(df_2024, qs_metrics_weights, '2024')

import pandas as pd
import plotly.graph_objects as go

# Rank cells are not always plain integers: ties are written like '3='.
# A plain pd.to_numeric(..., errors='coerce') would turn those into NaN and
# drop tied universities from the top 10, so keep the leading run of digits
# instead (any banded value such as '501-510', if present, maps to its lower
# bound). Values without digits still become NaN.
df_2024['2024 Rank'] = pd.to_numeric(
    df_2024['2024 Rank'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

# Merge the three years on 'Institution Name' (inner join: only universities
# present under the same name in every year survive)
df_merged = pd.merge(pd.merge(df_2022[['Institution Name', '2022 Rank']],
                              df_2023[['Institution Name', '2023 Rank']],
                              on='Institution Name'),
                     df_2024[['Institution Name', '2024 Rank']],
                     on='Institution Name')

# The 2022/2023 ranks are still raw text in the merged copy; make them numeric
# so the y-axis is a real number line rather than a set of string categories.
# Only the merged copy is touched, df_2022/df_2023 stay as they were.
for _rank_col in ('2022 Rank', '2023 Rank'):
    df_merged[_rank_col] = pd.to_numeric(
        df_merged[_rank_col].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )

# Get the top 10 universities based on their 2024 rank
top_10_universities_2024 = df_2024.nsmallest(10, '2024 Rank')['Institution Name'].tolist()

# Filter df_merged to only include the top 10 universities of 2024
df_merged_top_10 = df_merged[df_merged['Institution Name'].isin(top_10_universities_2024)]

# Visualization: one line per university across the three years
fig = go.Figure()

for uni in top_10_universities_2024:
    uni_data = df_merged_top_10[df_merged_top_10['Institution Name'] == uni]
    if uni_data.empty:
        # The inner merge dropped this university (missing or renamed in an
        # earlier year); skip it instead of failing on an empty selection.
        continue
    uni_row = uni_data.iloc[0]
    fig.add_trace(go.Scatter(x=['2022', '2023', '2024'],
                             y=[uni_row['2022 Rank'], uni_row['2023 Rank'], uni_row['2024 Rank']],
                             mode='lines+markers',
                             name=uni))

fig.update_layout(title='Ranking Trends for Top 10 Universities in 2024',
                  xaxis_title='Year',
                  yaxis_title='Rank',
                  yaxis_autorange='reversed')  # Rank 1 (best) appears at the top

fig.show()

import plotly.express as px

# Compare overall score with academic reputation for 2024, one point per
# university, coloured by country. Rows lacking either score are left out.
_scored_2024 = df_2024.dropna(subset=['Overall Score', 'Academic Reputation Score'])

fig = px.scatter(
    _scored_2024,
    x='Academic Reputation Score',
    y='Overall Score',
    hover_name='Institution Name',
    color='Country/Territory',
    title='Overall Score vs. Academic Reputation Score by Country',
)

fig.show()

import plotly.express as px
import numpy as np

# 'Citations per Faculty Score' drives the marker size. A helper column with
# NaN replaced by the median is added to df_2024 for that purpose.
# (Rows whose citation score is NaN are dropped for this plot anyway, so the
# filled values only matter to later users of the helper column.)
_citations = df_2024['Citations per Faculty Score']
default_size = _citations.median()
df_2024['Citations per Faculty Score for Size'] = _citations.fillna(default_size)

_citations_plot_df = df_2024.dropna(subset=['Citations per Faculty Score', 'Country/Territory'])

fig = px.scatter(
    _citations_plot_df,
    x='Country/Territory',
    y='Citations per Faculty Score',
    size='Citations per Faculty Score for Size',  # NaN-free helper column
    hover_name='Institution Name',
    color='Citations per Faculty Score',
    title='Citations per Faculty Score by Country',
)

fig.update_layout(
    xaxis_title="Country",
    yaxis_title="Citations per Faculty Score",
)

fig.show()

# Faculty-student score against overall score for 2024, with marker size
# indicating citation strength.
# Marker sizes must not be NaN (plotly rejects them), and the rows kept here
# are only guaranteed to have the two plotted scores. Build a NaN-free size
# column on a private copy, filling gaps with the median citation score.
_size_source = 'Citations per Faculty Score'
_fs_plot_df = df_2024.dropna(subset=['Faculty Student Score', 'Overall Score']).copy()
_fs_plot_df['Citations per Faculty Score for Size'] = (
    _fs_plot_df[_size_source].fillna(df_2024[_size_source].median())
)

fig = px.scatter(_fs_plot_df,
                 x='Faculty Student Score',
                 y='Overall Score',
                 size='Citations per Faculty Score for Size',  # This could indicate research strength
                 hover_name='Institution Name',
                 color='Country/Territory',
                 title='Faculty-Student Ratio vs. Overall Score')

fig.update_layout(xaxis_title="Faculty-Student Score",
                  yaxis_title="Overall Score")

fig.show()

# Bar chart of the international students score per country for 2024.
# Every university contributes one bar segment; countries are ordered by
# their total, largest first.
_intl_students_df = df_2024.dropna(subset=['International Students Score'])

fig = px.bar(
    _intl_students_df,
    x='Country/Territory',
    y='International Students Score',
    color='International Students Score',
    hover_name='Institution Name',
    title='International Students Score Across Different Countries',
)

fig.update_layout(
    xaxis_title="Country",
    yaxis_title="International Students Score",
    xaxis={'categoryorder': 'total descending'},
)

fig.show()

# Mean overall score per country, keeping the ten highest averages
_country_means = df_2024.groupby('Country/Territory')['Overall Score'].mean()
average_scores = _country_means.sort_values(ascending=False).head(10).reset_index()

fig = px.bar(
    average_scores,
    x='Country/Territory',
    y='Overall Score',
    color='Overall Score',
    title='Top 10 Countries by Average Overall Score in QS Rankings',
)

fig.update_layout(
    xaxis_title="Country",
    yaxis_title="Average Overall Score",
)

fig.show()

# Pairwise correlations between the six metric scores and the overall score,
# computed on the 2022 data
_heatmap_columns = [
    'Academic Reputation Score',
    'Employer Reputation Score',
    'Faculty Student Score',
    'Citations per Faculty Score',
    'International Faculty Score',
    'International Students Score',
    'Overall Score',
]
corr = df_2022[_heatmap_columns].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
_heatmap_ax = sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
_heatmap_ax.set_title('Correlation Heatmap of QS Ranking Metrics')
plt.show()

import pandas as pd
import plotly.express as px
import plotly.io as pio

# Make 'notebook_connected' the default Plotly renderer for every later
# fig.show() call; it works well when the notebook is exported with nbconvert
pio.renderers.default = 'notebook_connected'

def create_choropleth_map(dataframe, column_name, title):
    """Show a world choropleth of how often each country appears in a column.

    Args:
        dataframe: DataFrame with one row per university.
        column_name: Column holding country names; its value counts become
            the per-country colour value.
        title: Title displayed above the map.
    """
    # Two-column table: country name and its number of rows, most frequent first
    df_counts = (
        dataframe[column_name]
        .value_counts()
        .rename_axis('Country')
        .reset_index(name='University_Count')
    )

    # Countries are matched by name and shaded on a red scale
    fig = px.choropleth(
        df_counts,
        locations="Country",
        locationmode='country names',
        color="University_Count",
        color_continuous_scale=px.colors.sequential.Reds,
        title=title,
    )

    # Plain map: no frame, no coastlines, equirectangular projection
    geo_options = dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular',
    )
    fig.update_layout(geo=geo_options)

    fig.show()

# Map the number of ranked universities per country for each ranking year
create_choropleth_map(df_2022, 'Country/Territory', 'Number of Universities per Country in 2022')
create_choropleth_map(df_2023, 'Country/Territory', 'Number of Universities per Country in 2023')
create_choropleth_map(df_2024, 'Country/Territory', 'Number of Universities per Country in 2024')

import matplotlib.pyplot as plt
import seaborn as sns

# Requires df_2022, df_2023 and df_2024 to be loaded and cleaned already.

# 'Location Code' value to focus on; 'US' is the code used for United States rows
region_of_interest = 'US'

# One filtered frame per year, keeping only rows with that location code
df_region_2022, df_region_2023, df_region_2024 = (
    _yearly[_yearly['Location Code'] == region_of_interest]
    for _yearly in (df_2022, df_2023, df_2024)
)

def plot_region_universities(df, year, region=None):
    """Scatter the two internationalization scores against the overall score.

    Draws two side-by-side scatter plots: 'International Faculty Score' and
    'International Students Score', each against 'Overall Score'.

    Args:
        df: DataFrame with the three score columns named above.
        year: Label inserted into the figure title.
        region: Region label for the figure title. Defaults to the
            module-level ``region_of_interest`` so existing two-argument
            calls behave as before.
    """
    if region is None:
        region = region_of_interest

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle(f'{region} Universities Internationalization Impact in {year}', fontsize=16)

    metrics = ('International Faculty Score', 'International Students Score')
    for ax, metric in zip(axes, metrics):
        sns.scatterplot(ax=ax, x=metric, y='Overall Score', data=df)
        ax.set_title(f'{metric} vs Overall Score')
        ax.set_xlabel(metric)
        ax.set_ylabel('Overall Score')
        # The y-axis is a score, not a rank: higher is better and already
        # sits at the top, so the axis must NOT be inverted.

    fig.tight_layout(rect=[0, 0, 1, 0.95])  # leave room for the suptitle
    plt.show()

# Plot the filtered region's universities for each ranking year
plot_region_universities(df_region_2022, '2022')
plot_region_universities(df_region_2023, '2023')
plot_region_universities(df_region_2024, '2024')

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Make '2024 Rank' numeric. The 2024 file has no 'National Rank' column, so the
# result is written back to '2024 Rank' rather than to a new, misnamed column.
# Tied ranks are written with a trailing '=' (e.g. '3='); keep the leading
# digits so ties are not coerced to NaN and then mislabelled as "not top 500".
# NOTE(review): if an earlier cell already ran pd.to_numeric(..., errors='coerce')
# on this column, tied ranks are already NaN by now and cannot be recovered here.
df_2024['2024 Rank'] = pd.to_numeric(
    df_2024['2024 Rank'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

# Binary target: 1 = ranked in the top 500, 0 = otherwise (including unknown rank)
df_2024['Top_500'] = (df_2024['2024 Rank'] <= 500).astype(int)

# Select features
features = ['Academic Reputation Score', 'Employer Reputation Score',
            'Faculty Student Score', 'Citations per Faculty Score',
            'International Faculty Score', 'International Students Score']

# Drop rows with NaNs in the features or target.
# This rebinds df_2024, so later cells see the reduced frame.
df_2024 = df_2024.dropna(subset=features + ['Top_500'])

X = df_2024[features]
y = df_2024['Top_500']

# Hold out 20% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
log_reg = LogisticRegression()

# Fit the model
log_reg.fit(X_train, y_train)

# Predict on the testing set
y_pred = log_reg.predict(X_test)

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the held-out predictions (rows: actual, columns: predicted)
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap with integer cell labels
plt.figure(figsize=(10, 7))
_cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
_cm_ax.set_xlabel('Predicted')
_cm_ax.set_ylabel('Actual')
_cm_ax.set_title('Confusion Matrix')
plt.show()

              precision    recall  f1-score   support

           0       0.91      0.96      0.94       237
           1       0.62      0.41      0.49        37

    accuracy                           0.89       274
   macro avg       0.77      0.68      0.71       274
weighted avg       0.87      0.89      0.88       274

[[228   9]
 [ 22  15]]

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Convert the ranking columns to numeric.
# Tied ranks are written with a trailing '=' (e.g. '3='); a plain
# pd.to_numeric(..., errors='coerce') turns them into NaN, which would then be
# labelled 0 ("not top 500") below. Keep the leading run of digits instead;
# values without any digits still become NaN. Columns that are already numeric
# pass through unchanged.
for _frame, _rank_col in ((df_2022, '2022 Rank'),
                          (df_2023, '2023 Rank'),
                          (df_2024, '2024 Rank')):
    _frame[_rank_col] = pd.to_numeric(
        _frame[_rank_col].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )

# Binary target for the training years: 1 = ranked in the top 500
df_2022['Top_500'] = (df_2022['2022 Rank'] <= 500).astype(int)
df_2023['Top_500'] = (df_2023['2023 Rank'] <= 500).astype(int)

# Train on 2022 + 2023, restricted to the six metrics available in every year
features = ['Academic Reputation Score', 'Employer Reputation Score', 'Faculty Student Score',
            'Citations per Faculty Score', 'International Faculty Score', 'International Students Score']
X_train = pd.concat([df_2022[features], df_2023[features]], ignore_index=True)
y_train = pd.concat([df_2022['Top_500'], df_2023['Top_500']], ignore_index=True)

# Test on 2024
X_test = df_2024[features]
y_test = (df_2024['2024 Rank'] <= 500).astype(int)

# Define the base models for the voting classifier
log_clf = LogisticRegression(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
svc_clf = SVC(probability=True, random_state=42)  # probabilities are needed for soft voting

# Soft voting averages the predicted class probabilities of the three models
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rf_clf), ('svc', svc_clf)],
    voting='soft'
)

# Preprocessing (median imputation, standardisation) followed by the ensemble
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', voting_clf)
])

# Hyperparameter grid: 3 x 3 x 3 = 27 combinations
param_grid = {
    'classifier__lr__C': [0.1, 1, 10],
    'classifier__rf__n_estimators': [50, 100, 200],
    'classifier__svc__C': [0.1, 1, 10]
}

# Execute grid search; n_jobs=-1 runs the cross-validation fits in parallel
# without changing the results
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Evaluate on the test set
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

Best parameters: {'classifier__lr__C': 0.1, 'classifier__rf__n_estimators': 50, 'classifier__svc__C': 0.1}
Best score: 0.87141594711279
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      1171
           1       0.63      0.43      0.51       199

    accuracy                           0.88      1370
   macro avg       0.77      0.69      0.72      1370
weighted avg       0.87      0.88      0.87      1370

	National Rank	Regional Rank	2022 Rank	2021 Rank	Institution Name	Location Code	Country/Territory	Size	Focus	Research Intensity	...	Employer Reputation Rank	Faculty Student Score	Faculty Student Rank	Citations per Faculty Score	Citations per Faculty Rank	International Faculty Score	International Faculty Rank	International Students Score	International Students Rank	Overall Score
0	1	1	1	1	Massachusetts Institute of Technology (MIT)	US	United States	M	CO	VH	...	4	100.0	12	100.0	6	100.0	45	91.4	105	100
1	1	1	2	5	University of Oxford	UK	United Kingdom	L	FC	VH	...	3	100.0	5	96.0	34	99.5	83	98.5	52	99.5
2	2	2	3=	2	Stanford University	US	United States	L	FC	VH	...	5	100.0	9	99.9	10	99.8	73	67.0	208	98.7
3	2	2	3=	7	University of Cambridge	UK	United Kingdom	L	FC	VH	...	2	100.0	10	92.1	48	100.0	57	97.7	64	98.7
4	3	3	5	3	Harvard University	US	United States	L	FC	VH	...	1	99.1	37	100.0	3	84.2	188	70.1	196	98

	2023 Rank	2022 Rank	Institution Name	Location Code	Country/Territory	Size	Focus	Research Intensity	Age Band	Status	...	Citations per Faculty Rank	International Faculty Score	International Faculty Rank	International Students Score	International Students Rank	International Research Network Score	International Research Network Rank	Employment Outcomes Score	Employment Outcomes Rank	Overall Score
0	1	1	Massachusetts Institute of Technology (MIT)	US	United States	M	CO	VH	5.0	B	...	5	100.0	54	90.0	109	96.1	58	100.0	3	100
1	2	3=	University of Cambridge	UK	United Kingdom	L	FC	VH	5.0	A	...	55	100.0	60	96.3	70	99.5	6	100.0	9	98.8
2	3	3=	Stanford University	US	United States	L	FC	VH	5.0	B	...	9	99.8	74	60.3	235	96.3	55	100.0	2	98.5
3	4	2	University of Oxford	UK	United Kingdom	L	FC	VH	5.0	A	...	64	98.8	101	98.4	54	99.9	3	100.0	7	98.4
4	5	5	Harvard University	US	United States	L	FC	VH	5.0	B	...	2	76.9	228	66.9	212	100.0	1	100.0	1	97.6

	2024 Rank	2023 Rank	Institution Name	Location Code	Country/Territory	Size	Focus	Research Intensity	Status	Academic Reputation Score	...	International Faculty Rank	International Students Score	International Students Rank	International Research Network Score	International Research Network Rank	Employment Outcomes Score	Employment Outcomes Rank	Sustainability Score	Sustainability Rank	Overall Score
0	1	1	Massachusetts Institute of Technology (MIT)	US	United States	M	CO	VH	B	100.0	...	56	88.2	128	94.3	58	100.0	4	95.2	51	100
1	2	2	University of Cambridge	UK	United Kingdom	L	FC	VH	A	100.0	...	64	95.8	85	99.9	7	100.0	6	97.3	33=	99.2
2	3	4	University of Oxford	UK	United Kingdom	L	FC	VH	A	100.0	...	110	98.2	60	100.0	1	100.0	3	97.8	26=	98.9
3	4	5	Harvard University	US	United States	L	FC	VH	B	100.0	...	210	66.8	223	100.0	5	100.0	1	96.7	39	98.3
4	5	3	Stanford University	US	United States	L	FC	VH	B	100.0	...	78	51.2	284	95.8	44	100.0	2	94.4	63	98.1

	Age Band	Academic Reputation Score	Employer Reputation Score	Faculty Student Score	Citations per Faculty Score	International Faculty Score	International Students Score	Overall Score
count	1300.000000	1300.000000	1300.000000	1299.000000	1300.000000	1228.000000	1275.000000	501.000000
mean	4.011538	21.552462	22.193000	31.907313	26.293308	26.503746	28.119059	44.767066
std	0.988318	23.315627	24.535947	28.564402	28.299027	35.429502	31.211629	18.961269
min	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	24.100000
25%	3.000000	6.200000	5.100000	9.400000	3.400000	1.700000	3.750000	29.600000
50%	4.000000	11.900000	11.950000	20.600000	13.400000	5.400000	13.200000	38.600000
75%	5.000000	25.925000	29.625000	47.950000	43.400000	44.425000	44.450000	55.400000
max	5.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000

	Age Band	Academic Reputation Score	Employer Reputation Score	Faculty Student Score	Citations per Faculty Score	International Faculty Score	International Students Score	International Research Network Score	Employment Outcomes Score	Overall Score
count	1411.000000	1422.000000	1421.000000	1420.000000	1417.000000	1324.000000	1365.000000	1409.000000	1410.000000	500.000000
mean	4.008505	20.124684	20.657143	29.997113	24.529358	31.659517	26.545348	49.570121	26.186809	44.619400
std	0.965320	22.802706	24.027928	28.172207	27.910952	34.170817	30.896854	30.205439	26.201036	18.655057
min	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	24.200000
25%	3.000000	5.400000	4.400000	8.200000	3.100000	4.800000	3.300000	21.600000	6.700000	29.800000
50%	4.000000	10.800000	10.300000	18.250000	11.100000	13.750000	10.800000	47.700000	15.500000	38.550000
75%	5.000000	23.775000	27.000000	43.500000	39.400000	55.075000	40.500000	77.600000	36.900000	54.500000
max	5.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000

QS World University Rankings Analysis Project

Weike ZHANG

Nov 2024

Project Outline

Introduction

Objectives:

Data and Summary Statistics

I. Data Sources (Extraction, Transform, and Load)

II. Summary Statistics

Measure and Variable Definition

Exploratory Data Analysis (EDA)

I. Ranking Trends

II. Metric Correlations

III. Geographic Trends

IV. Internationalization

Empirical Results

I. Regression Analysis

II. Predictive Modelling

Conclusion and Implications

Additional Sections:

Appendices:

References:

Data and Summary Statistics

I. Data Sources (Extraction, Transform, and Load)

QS World University Rankings Metrics Explained

II. Summary Statistics

Measure and Variable Definition

In This Section:

Exploratory Data Analysis (EDA)

I. Ranking Trends

II. Correlation Analysis of QS Ranking Metrics

III. Geographic Distribution of QS Ranked Universities

IV. Internationalization

Empirical Results

I. Regression Analysis

II. Predictive Modelling

Reference

	Academic Reputation Score	Employer Reputation Score	Faculty Student Score	Citations per Faculty Score	International Faculty Score	International Students Score	International Research Network Score	Employment Outcomes Score	Sustainability Score	Overall Score
count	1498.000000	1497.000000	1474.000000	1474.000000	1372.000000	1418.000000	1494.000000	1474.000000	1398.000000	602.000000
mean	20.132043	19.806880	28.643894	23.940163	30.948834	25.575035	23.967938	20.016961	25.412017	40.879900
std	22.365895	23.764625	27.843868	28.075573	34.247562	30.867149	30.371277	20.241410	31.010557	19.181335
min	1.600000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	19.800000
25%	6.000000	4.100000	7.500000	2.800000	4.300000	3.000000	1.200000	8.225000	1.400000	25.700000
50%	10.900000	9.500000	16.750000	10.400000	13.050000	9.850000	6.850000	11.700000	8.400000	34.550000
75%	23.100000	25.500000	41.900000	37.900000	52.725000	38.075000	40.375000	22.475000	42.525000	51.300000
max	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000	100.000000