# stastics.py 932 Bytes
import pyarrow.parquet as pq
import pandas as pd
import matplotlib.pyplot as plt
# Dataset identifier: selects the parquet/<category> input and prefixes every
# chart title.
category = 'NIRW19'


def _count_authors_by(frame, key):
    """Return the number of non-null ``author`` values per group of *key*.

    *key* is anything ``DataFrame.groupby`` accepts (a column name or a
    ``pd.Grouper``).  The ``author`` column is selected before counting so
    the other columns are not aggregated only to be thrown away.
    """
    return frame.groupby(key)['author'].count()


def _show_bar_chart(counts, title):
    """Show *counts* as a bar chart with each bar's value written above it.

    Returns the Axes the chart was drawn on.
    """
    # A dedicated figure per chart: nothing depends on whichever axes happens
    # to be "current", so charts cannot pile up on one another when
    # plt.show() does not block or close the previous figure.
    _, ax = plt.subplots()
    counts.plot(kind="bar", ax=ax)
    ax.set_title(title)
    for position, value in enumerate(counts):
        # Bars are centred on integer positions; let matplotlib centre the
        # label instead of guessing a horizontal offset.
        ax.text(position, value, str(value), ha="center", va="bottom")
    plt.show()
    return ax


def _show_line_chart(counts, title):
    """Show *counts* as a line chart on its own figure and return the Axes."""
    _, ax = plt.subplots()
    counts.plot(ax=ax)
    ax.set_title(title)
    plt.show()
    return ax


def _show_pie_chart(counts, title):
    """Show *counts* as a pie chart with percentage labels; return the Axes."""
    _, ax = plt.subplots()
    counts.plot.pie(ax=ax, autopct='%1.1f%%')
    ax.set_title(title)
    plt.show()
    return ax


# Load only the columns the statistics below need.
table = pq.read_pandas(f"parquet/{category}", columns=['author', 'publisher', 'date', 'topic'])
print(table.schema)
df = table.to_pandas()

publisher = _count_authors_by(df, 'publisher')
topic = _count_authors_by(df, 'topic')
# NOTE(review): pandas 2.2+ warns that the 'M' (month-end) alias is deprecated
# in favour of 'ME'; left unchanged so older pandas versions keep working.
month = _count_authors_by(df, pd.Grouper(key='date', freq='M'))

# NOTE(review): presumably chosen so non-Latin labels render; the font must be
# installed on the machine running the script -- confirm.
plt.rc('font', family='BATANG')

pub_ax = _show_bar_chart(publisher, f"{category}-Publisher")
top_ax = _show_bar_chart(topic, f"{category}-Topic")
_show_line_chart(month, f"{category}-Month")
_show_pie_chart(publisher, f"{category}-Publisher")
_show_pie_chart(topic, f"{category}-Topic")