In this project, I will explore Pittsburgh's restaurant review data available through the Yelp Dataset Challenge.
This assignment is broken into two parts:
Because Yelp reviews include the number of stars given by the user, the Yelp data set provides a unique opportunity to test how well our sentiment analysis works by comparing the number of stars to the polarity of reviews.
We'll explore geographic trends in the restaurant reviews, comparing our sentiment analysis results with user stars across the city. We'll also overlay review stars on maps of household income (using census data).
In this part, we'll load the data, perform a sentiment analysis, and explore the results.
import pandas as pd
# Load the Pittsburgh subset of the Yelp review data
pittReviews = pd.read_json("data/reviews_pittsburgh.json.gz", orient="records", lines=True)
The first step is to split the review text into its individual words and make all of the words lowercase.
We added a new column, called 'formatted_text', in which each entry is a list of the lowercased words in a review.
def format_text(row):
    # Lowercase the review and split it on whitespace
    return row.lower().split()

# Store the word lists in a new 'formatted_text' column
pittReviews['formatted_text'] = [format_text(row) for row in pittReviews['text']]
We used the nltk library to remove stop words from the list of words in each review, overwriting the 'formatted_text' column so that each entry is a list of lowercased words with no stop words.
import nltk
nltk.download('stopwords')
stop_words = list(set(nltk.corpus.stopwords.words('english')))
import string
punctuation = list(string.punctuation)
ignored = stop_words + punctuation

def removeStop_text(row):
    # Keep only the words that are not stop words or punctuation
    return [word for word in row if word not in ignored]

# Overwrite 'formatted_text' with the filtered word lists
pittReviews['formatted_text'] = [removeStop_text(row) for row in pittReviews['formatted_text']]
Using the formatted text column, we created a list of textblob.TextBlob() objects and then extracted the subjectivity and polarity of each review. We added two new columns to the review DataFrame: polarity and subjectivity.
import textblob

def getBlob_text(row):
    # Re-join the word list into a single string and wrap it in a TextBlob
    return textblob.TextBlob(' '.join(row))

blobs = [getBlob_text(row) for row in pittReviews['formatted_text']]

# Extract the polarity and subjectivity of each review's sentiment
data = {}
data['polarity'] = [blob.sentiment.polarity for blob in blobs]
data['subjectivity'] = [blob.sentiment.subjectivity for blob in blobs]
data = pd.DataFrame(data)

pittReviews = pd.concat([pittReviews, data], axis=1, ignore_index=False)
We used seaborn to make two box plots: one showing polarity vs. the number of user stars, and one showing subjectivity vs. the number of user stars.
import seaborn as sns
sns.set_theme(style="whitegrid")

ax = sns.boxplot(x=pittReviews["stars"], y=pittReviews["polarity"])
ax.set_title('polarity by the number of user stars')

ax = sns.boxplot(x=pittReviews["stars"], y=pittReviews["subjectivity"])
ax.set_title('subjectivity by the number of user stars')
The charts indicate that the sentiment analysis is fairly effective. The subjectivity chart shows that reviews with more stars tend to be slightly more subjective, but the distributions are fairly consistent across star groups. The polarity values, by contrast, increase noticeably with the number of stars, with much larger differences between groups. Since polarity separates the star groups far more clearly than subjectivity does, the analysis appears to be working well.
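One quick way to back this reading up with numbers (a minimal sketch, assuming the pittReviews DataFrame built above) is to compare each measure's mean and spread by star group:
# Group means and standard deviations; if the box plots are right,
# polarity's means should climb much more steeply across star groups
pittReviews.groupby('stars')[['polarity', 'subjectivity']].agg(['mean', 'std'])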
In this part, we explored the importance and frequency of individual words in Yelp reviews.
We identified the most common words and then plotted the average polarity vs. the user stars for the reviews where those words occur.
First, we selected 1,000 random rows from the DataFrame holding the review data, using the .sample() function to perform the selection.
pittReviews_random = pittReviews.sample(1000)
Then we passed the subset of review data from the previous part to the reshape_data() function defined below.
def reshape_data(review_subset):
    """
    Reshape the input dataframe of review data.
    """
    from pandas import Series, merge

    # Explode each review's word list so every word gets its own row,
    # keeping the original review index
    X = (review_subset['formatted_text']
         .apply(Series)
         .stack()
         .reset_index(level=1, drop=True)
         .to_frame('word'))

    # Join each word back to its review's polarity, stars, and id
    R = review_subset[['polarity', 'stars', 'review_id']]
    return merge(R, X, left_index=True, right_index=True).reset_index(drop=True)
reshaped_pittReviews_random = reshape_data(pittReviews_random)
"The reshape_data()
function breaks each sentence in the formatted text column into words and then was joined to the polarity and stars review results of its sentence."
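To illustrate what reshape_data() does, here is a toy example with made-up reviews (the words, polarities, and ids are hypothetical, not from the Yelp data):
toy = pd.DataFrame({
    'formatted_text': [['great', 'pizza'], ['slow', 'service']],
    'polarity': [0.8, -0.3],
    'stars': [5, 2],
    'review_id': ['a1', 'b2'],
})

# Each word becomes its own row, carrying its review's polarity and stars:
#    polarity  stars review_id     word
# 0       0.8      5        a1    great
# 1       0.8      5        a1    pizza
# 2      -0.3      2        b2     slow
# 3      -0.3      2        b2  service
print(reshape_data(toy))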
Using the result from 1.6.2, we grouped the dataframe by the "word" column and calculated three quantities: the number of occurrences of each word (N), its average polarity, and its average stars.
# Count each word's occurrences and average its polarity and stars
word = (reshaped_pittReviews_random
        .groupby('word')
        .agg(N=('word', 'size'),
             polarity=('polarity', 'mean'),
             stars=('stars', 'mean'))
        .reset_index())
We trimmed the DataFrame from the last section to only include words that occurred at least 50 times.
word_over50 = word.loc[word['N']>=50]
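As a quick sanity check (not part of the assignment itself), we can peek at the most frequent words that survive the threshold:
# The most common remaining words; generic restaurant vocabulary
# like 'food', 'good', and 'place' should rank near the top
word_over50.sort_values('N', ascending=False).head(10)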
We used matplotlib to make a scatter plot of the average user stars vs. the average polarity for the words in the data frame from the last section. This involved two steps:
First, loop over each row of the data frame and, for each row, call plt.scatter(x, y) to plot a marker, where x is the average polarity and y is the average stars, then call plt.text(x, y, word) to label the marker with its word.
Second, using the data frame from section 1.4, add vertical and horizontal lines to the chart showing the average number of user stars and the average polarity across all reviews in the data set.
We saw a strong trend between polarity and user stars, and the chart highlights some of the most common words occurring in these reviews.
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(20, 15))
for i in word_over50.index:
    w = word_over50['word'][i]
    pol = word_over50['polarity'][i]
    sta = word_over50['stars'][i]
    # Plot each word as a point at (average polarity, average stars)
    plt.scatter(pol, sta, marker='o', c='pink')
    plt.text(pol + 0.001, sta + 0.001, w, size=18)
plt.xlabel("average polarity", fontsize=20)
plt.xticks(fontsize=14)
plt.ylabel("average stars", fontsize=20)
plt.yticks(fontsize=14)
plt.axvline(x=pittReviews['polarity'].mean(), linestyle='--', label="average polarity")
plt.axhline(y=pittReviews['stars'].mean(), linestyle='--', c='green', label="average stars")
plt.legend(loc='upper left')
plt.title('average stars and polarity for words', size=22)
In this part, we will use the census API to download household income data and overlay restaurant locations.
We used the cenpy package to download median household income in the past 12 months by census tract from the 2018 ACS 5-year data set for our county of interest, Allegheny County (which contains Pittsburgh).
import cenpy
acs = cenpy.remote.APIConnection("ACSDT5Y2018")
variables = [
"NAME",
"B19013_001E", # MEDIAN HOUSEHOLD INCOME IN THE PAST 12 MONTHS
]
# FIPS codes for Allegheny County, Pennsylvania
pitt_county_code = "003"
pa_state_code = "42"

# Query the ACS for every census tract in Allegheny County
pitt_inc_data = acs.query(
    cols=variables,
    geo_unit="tract:*",
    geo_filter={"state": pa_state_code, "county": pitt_county_code},
)

for variable in variables:
    # Convert all variables EXCEPT for NAME to numeric values
    if variable != "NAME":
        pitt_inc_data[variable] = pitt_inc_data[variable].astype(float)
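One caveat: the Census API reports unavailable estimates with large negative sentinel values (for example, -666666666 for a suppressed median income), so a reasonable cleanup step on the pitt_inc_data frame above is to mask them before mapping:
import numpy as np

# Replace the ACS missing-data sentinels (large negative values) with NaN
# so they don't distort the choropleth bins
pitt_inc_data.loc[pitt_inc_data['B19013_001E'] < 0, 'B19013_001E'] = np.nan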
We then used cenpy to set the correct map service and download the census tract geometries for the desired geography.
acs.set_mapservice("tigerWMS_ACS2018")
# Layer 8 of this map service contains the census tract geometries
acs.mapservice.layers[8]

# Use SQL to return geometries only for Allegheny County in PA
where_clause = f"STATE = {pa_state_code} AND COUNTY = {pitt_county_code}"

# Query for census tracts
pitt_tracts = acs.mapservice.layers[8].query(where=where_clause)
pitt_inc_final = pitt_tracts.merge(
    pitt_inc_data,
    left_on=["STATE", "COUNTY", "TRACT"],
    right_on=["state", "county", "tract"],
)
fig, ax = plt.subplots(figsize=(10, 10))

# Plot the choropleth
pitt_inc_final.plot(ax=ax, column='B19013_001E', legend=True,
                    legend_kwds=dict(loc="lower left"),
                    cmap='viridis', scheme="Quantiles", k=5)

# Format
ax.set_title("Median household income in Allegheny County by census tract", fontsize=16)
ax.set_axis_off()
We used the latitude and longitude columns to create a GeoDataFrame after loading the JSON data.
pittRestaurants = pd.read_json("data/restaurants_pittsburgh.json.gz", orient="records", lines=True)
pittRestaurants
import geopandas as gpd

pittRestaurants_gdf = gpd.GeoDataFrame(
    pittRestaurants,
    geometry=gpd.points_from_xy(pittRestaurants.longitude, pittRestaurants.latitude),
    crs="EPSG:4326",
)

# Match the CRS of the census tract geometries (Web Mercator)
pittRestaurants_gdf = pittRestaurants_gdf.to_crs(epsg=3857)
We overlaid the restaurants and colored the points according to the 'stars' column.
fig, ax = plt.subplots(figsize=(20, 15))

# Plot the income choropleth in grayscale as the base map
pitt_inc_final.plot(ax=ax, column='B19013_001E', legend=True,
                    legend_kwds=dict(loc="lower left"),
                    cmap='gray', scheme="Quantiles", k=5)

# Overlay the restaurants, colored by the 'stars' column
pittRestaurants_gdf.plot(ax=ax, column='stars', legend=True, alpha=0.8, cmap='coolwarm')
[xmin, ymin, xmax, ymax] = pitt_inc_final.total_bounds
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.set_axis_off()
ax.set_title("restaurants on the income map", fontsize=16)
Similar to what we saw in Section 1, there appears to be a strong correlation between review polarity and user stars, which the two hexbin subplots below show geographically.
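Before mapping, one simple way to quantify that relationship (a minimal check, not part of the original assignment) is the review-level correlation between stars and polarity:
# Pearson correlation between user stars and TextBlob polarity;
# a clearly positive value supports the visual impression from Section 1
print(pittReviews['stars'].corr(pittReviews['polarity']))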
merged_reviews = pittRestaurants_gdf.merge(pittReviews, on='business_id')
# create the axes
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,15))
# Extract the x/y coordinates of the Point objects
xcoords = merged_reviews.geometry.x
ycoords = merged_reviews.geometry.y
# Add the census tract boundaries
pitt_inc_final.plot(ax=ax1, facecolor="none", edgecolor="gray", linewidth=0.5)
pitt_inc_final.plot(ax=ax2, facecolor="none", edgecolor="gray", linewidth=0.5)
# Plot a hexbin chart
hex_vals1 = ax1.hexbin(xcoords, ycoords, C=merged_reviews['polarity'], gridsize=50, cmap='coolwarm')
hex_vals2 = ax2.hexbin(xcoords, ycoords, C=merged_reviews['stars_y'], gridsize=50, cmap='coolwarm')
# add a colorbar and format
cb1 = fig.colorbar(hex_vals1, ax=ax1, location="bottom")
cb1.set_label('polarity')
cb2 = fig.colorbar(hex_vals2, ax=ax2, location="bottom")
cb2.set_label('stars')
ax1.set_axis_off()
ax2.set_axis_off()
ax1.set_title('hex bins showing the polarity of restaurant reviews', size=22)
ax2.set_title('hex bins showing the number of stars of restaurants', size=22)
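To push past the visual comparison, a possible follow-up (a sketch assuming geopandas >= 0.10 and the pitt_inc_final and pittRestaurants_gdf frames built earlier, both in EPSG:3857) is to spatially join each restaurant to its census tract and correlate stars with median income:
# Assign each restaurant to the census tract that contains it
joined = gpd.sjoin(
    pittRestaurants_gdf,
    pitt_inc_final[['geometry', 'B19013_001E']],
    how='inner',
    predicate='within',
)

# Correlation between a restaurant's rating and its tract's median income
print(joined['stars'].corr(joined['B19013_001E']))
A value near zero here would suggest that the visual similarity between the maps reflects where restaurants cluster rather than income itself.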