Code for Zipf's Word Frequency Plot with Python Tutorial


View on GitHub

zipf_curve.py

# Imports
import os
import string
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt
from scipy.interpolate import make_interp_spline

# Module-level containers, filled in further down and keyed by text name
texts, textlengths, textwordamounts = {}, {}, {}

# Every ASCII punctuation mark, as a list of single characters
unwantedCharacters = [*string.punctuation]

# How many ranks we'll show
depth = 10
# Tick labels for the ranks: '1' up to and including str(depth)
xAxis = list(map(str, range(1, depth + 1)))

# Getting all .txt files in the text folder.
# Sub-directories and other file types are skipped so open() below cannot
# fail on them, and the names are sorted so the legend order is stable
# (os.listdir returns entries in arbitrary order).
filePaths = sorted(
    name for name in os.listdir('texts')
    if name.lower().endswith('.txt')
    and os.path.isfile(os.path.join('texts', name))
)

# Getting text from .txt files in folder
for path in filePaths:
    with open(os.path.join('texts', path), 'r', encoding='UTF-8') as f:
        # splitext removes only the final extension, so a dotted name such
        # as 'moby.dick.txt' keeps its full stem ('moby.dick') as the key
        # instead of being truncated at the first dot
        texts[os.path.splitext(path)[0]] = f.read()


# Cleaning and counting the text

# Translation table that deletes every unwanted character; built once so
# each text is cleaned in a single pass instead of one replace() per character
punctuationTable = str.maketrans('', '', ''.join(unwantedCharacters))

for text in texts:

    # Remove unwanted characters from the text and lowercase it once
    texts[text] = texts[text].translate(punctuationTable).lower()

    # split() without an argument splits on any run of whitespace (spaces,
    # newlines, tabs) and never yields empty strings, so words separated by
    # line breaks are not glued together and '' is not counted as a word
    splittedText = texts[text].split()

    # Saving the text length (number of words) to show in the label of the
    # line later
    textlengths[text] = len(splittedText)

    # Keep only the `depth` most frequent words, ordered from most to least
    # frequent; words with equal counts stay in first-encountered order
    textwordamounts[text] = dict(Counter(splittedText).most_common(depth))

# Express a value as a whole-number percentage of a given maximum
def percentify(value, max):
    """Return ``value`` as a percentage of ``max``, rounded with ``round``.

    Args:
        value: The number to convert.
        max: The number that corresponds to 100 percent. The name shadows
            the builtin, but it is part of the signature and therefore kept.

    Returns:
        ``value / max * 100`` rounded to the nearest integer.
    """
    fraction = value / max
    return round(fraction * 100)

# Generate smooth curves
def smoothify(yInput):
    """Interpolate a sequence of y values into a smooth curve.

    The values are placed at the x positions 0, 1, ..., len(yInput) - 1 and
    an interpolating B-spline is fitted through them.

    Args:
        yInput: Sequence of numeric y values.

    Returns:
        A tuple ``(x_smooth, y_smooth)`` of NumPy arrays. For two or more
        input values, both arrays hold 600 samples spanning the first to the
        last x position. For fewer than two values there is nothing to
        interpolate, so the raw x positions and y values are returned.
    """
    y = np.asarray(yInput, dtype=float)

    # Size the x axis from the data itself, so inputs with fewer points
    # than the number of displayed ranks still produce matching x and y
    x = np.arange(len(y))

    if len(y) < 2:
        return x.astype(float), y

    # define x as 600 equally spaced values between the min and max of original x
    x_smooth = np.linspace(x.min(), x.max(), 600)

    # The spline degree determines the amount of wiggle; cubic (3) is used
    # when possible, but a degree-k spline needs at least k + 1 points
    degree = min(3, len(y) - 1)
    spl = make_interp_spline(x, y, k=degree)
    y_smooth = spl(x_smooth)

    # Return the two x and y axes
    return x_smooth, y_smooth

# Make the perfect curve: the value at rank r is 100 / r, i.e. the top
# rank is 100 percent and every later rank is that share divided by its rank
ziffianCurveValues = [100/i for i in range(1, depth+1)]

x, y = smoothify(ziffianCurveValues)

plt.plot(x, y, label='Zipfian Curve', ls=':', color='grey')


# Plot the texts
for i in textwordamounts:
    counts = list(textwordamounts[i].values())

    # A text without any counted words has nothing to plot
    if not counts:
        continue

    # The first stored count is used as the 100 percent reference
    maxValue = counts[0]

    yAxis = [percentify(value, maxValue) for value in counts]

    x, y = smoothify(yAxis)

    # The legend entry shows the text name followed by its word count
    plt.plot(x, y, label=f'{i} [{textlengths[i]}]', lw=1, alpha=0.5)

# Label the tick positions 0..depth-1 with the rank labels in xAxis
plt.xticks(range(0, depth), xAxis)

plt.legend()
plt.savefig('wordamounts.png', dpi=300)
plt.show()


pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy