More than 8 years of messages analysis

Total messages by Day

I downloaded my data from Facebook in a .json format. I used Python with Jupyter Notebook to play with data.

The json and pandas libraries are very useful to read and display data in a stylized way.

I added some columns to the data such as date (since the available time was a millisecond timestamp) and total characters for each row (each row represents a sent message).

Finally, I exported the data as an .xls file and opened it with Tableau to make the graphs.

# data time range : 2010-10-24 to 2019-07-05
# Imports

import json
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import operator
fileName = 'message_1.json'
orderedDays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
with open(fileName) as jsonFile:  
    data = json.load(jsonFile)
    dataList = []
    words = {}
    for index, message in enumerate(data['messages']):
        row = []
        row.append('Me' if message["sender_name"] == 'Mokhles Bouzaien' else 'Her')
        row.append(message['timestamp_ms'])
        try:
            row.append(message['content'])
        except KeyError:
            row.append(None)
            
        try:
            wordList = message["content"].split()
        except KeyError:
            pass
            
        for word in wordList:
            try:
                words[word] += 1
            except KeyError:
                words[word] = 1
                
        dataList.append(row)
dataList = np.array(dataList)
df = pd.DataFrame(dataList,columns=['sender', 'timestamp_ms', 'content'])
df['date'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000), axis = 1)
df['year'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000).year, axis = 1)
df['month'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000).month, axis = 1)
df['day'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000).day, axis = 1)
df['weekday'] = df.apply(lambda row: datetime.date.fromtimestamp(row.timestamp_ms / 1000).strftime('%A'), axis = 1)
df['hour'] = df.apply(lambda row: datetime.datetime.fromtimestamp(row.timestamp_ms / 1000).hour, axis = 1)
df['caracters'] = df.apply(lambda row: len(row.content) if row.content != None else 0, axis = 1)
df.head()

sendertimestamp_mscontentdateyearmonthdayweekdayhourcaracters
0Her1562350465800Hidden Message2019-07-05201975Friday2051
1Her1562350430104Hidden Message2019-07-05201975Friday2033
2Her1562350413998Hidden Message2019-07-05201975Friday2043
3Me1562350377019Hidden Message2019-07-05201975Friday2012
4Me1562350371595Hidden Message2019-07-05201975Friday208
# Data is exported as .slsx file and imported to Tableau
df.to_excel("output.xlsx", engine='xlsxwriter')
sortedWords = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
# group data by sender
dfBySender = df.groupby('sender')['caracters'].agg(['sum', 'count'])
# dfBySender = pd.DataFrame(dfBySender).reset_index()
dfBySender.head()

sumcount
sender
Her6351619210679
Me2953745152407
# dfBySender.plot.pie(y='count', figsize=(5, 5))
# group data by date
dfByDate = df.groupby('date')['caracters'].agg(['sum', 'count'])
dfByDate.head()

sumcount
date
2010-10-24432
2010-10-2691
2010-10-27131
2010-12-17191
2011-03-31165
print(dfByDate.loc[dfByDate['sum'].idxmax()])
print(dfByDate.loc[dfByDate['count'].idxmax()])
print(dfByDate['sum'].mean())
print(dfByDate['count'].mean())
sum      36354
count      346
Name: 2012-07-23, dtype: int64
sum      22975
count     1171
Name: 2018-08-20, dtype: int64
3553.0217640320734
138.63535700649103
# plt.figure(figsize=(16,12))
# dfByDate['sum'].plot()
# plt.figure(figsize=(16,12))
# dfByDate['count'].plot()
# plt.figure(figsize=(16,6))
# plt.subplot(121)
# dfByDate.cumsum()['sum'].plot()
# plt.subplot(122)
# dfByDate.cumsum()['count'].plot()
# group data by month
dfByMonth = df.groupby(['year','month'])['caracters'].agg(['sum', 'count'])
dfByMonth = dfByMonth.reset_index()
dfByMonth.head()

yearmonthsumcount
0201010654
1201012191
220113165
3201172039154
42011817722
# sns.set()
# caracters = dfByMonth.pivot("month", "year", "sum")
# messages = dfByMonth.pivot("month", "year", "count")
# f, ax = plt.subplots(figsize=(16, 12))
# sns.heatmap(caracters, annot=False, ax=ax, cmap='YlGnBu')
# f, ax = plt.subplots(figsize=(16, 12))
# sns.heatmap(messages, annot=False, ax=ax, cmap='YlGnBu')
# group data by year
dfByYear = df.groupby('year')['caracters'].agg(['sum', 'count'])
dfByYear = pd.DataFrame(dfByYear).reset_index()
dfByYear.head()

yearsumcount
02010845
120111807876445
22012197659053485
3201381797826960
4201469870622401
# group data by weekday
dfByWeekday = df.groupby('weekday')['caracters'].agg(['sum', 'count'])
dfByWeekday = dfByWeekday.reindex(orderedDays)
dfByWeekday = dfByWeekday.reset_index()
dfByWeekday

weekdaysumcount
0Monday138327752854
1Tuesday126940550000
2Wednesday124804948457
3Thursday117911946607
4Friday138742654655
5Saturday123729049303
6Sunday160079861210
# # group data by date and weekday
# dfByDateWeekday = df.groupby(['date', 'weekday'])['caracters'].agg(['sum', 'count'])
# dfByDateWeekday = dfByDateWeekday.reset_index()
# dfByDateWeekday.head()
# sns.catplot(x="weekday", y="count", order=orderedDays, data=dfByDateWeekday, height=12)
# sns.catplot(x="weekday", y="count", order=orderedDays, kind='bar', data=dfByWeekday, height=12)
# sns.boxplot(x="weekday", y="count", data=dfByDateWeekday)
# group data by hour
dfByHour = df.groupby('hour')['caracters'].agg(['sum', 'count'])
dfByHour = dfByHour.reset_index()
dfByHour.head()

hoursumcount
0062984226814
1141086717451
222392919799
331768467712
441230765105
# dfByHour['count'].plot.bar()

Reddit Post: https://www.reddit.com/r/dataisbeautiful/comments/caa9il/more_than_8_years_of_messages_between_my/

Mokhles Bouzaien
Mokhles Bouzaien
Master of Science in Engineering Student

A self-motivated graduate student in Engineering at IMT Atlantique.

Next