Created by Nathan Kelber and Ted Lawless for JSTOR Labs under Creative Commons CC BY License
For questions/comments/improvements, email nathan.kelber@ithaka.org.


Tokenize Text Files with NLTK

Description: This notebook takes as input:

  • Plain text files (.txt) in a zipped folder called ‘texts’ in the data folder

  • Metadata CSV file called ‘metadata.csv’ in the data folder (optional)

and outputs a single JSON-L file containing the unigrams, bigrams, trigrams, full text, and metadata for each document.
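
For reference, each line of the output JSON-L file holds one JSON object per document. A record for a short, hypothetical four-word text would look roughly like this (without the optional metadata fields):

{"id": "sample_document", "outputFormat": ["unigram", "bigram", "trigram", "fullText"], "wordCount": 4, "fullText": "the quick brown fox", "unigramCount": {"the": 1, "quick": 1, "brown": 1, "fox": 1}, "bigramCount": {"the quick": 1, "quick brown": 1, "brown fox": 1}, "trigramCount": {"the quick brown": 1, "quick brown fox": 1}}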

Use Case: For Researchers (Mostly code without explanation, not ideal for learners)

Difficulty: Advanced

Completion time: 10-15 minutes

Knowledge Required:

Knowledge Recommended:

Data Format: .txt, .csv, .jsonl

Libraries Used:

  • os

  • json

  • gzip

  • zipfile

  • nltk (including nltk.corpus)

  • collections

  • pandas

Research Pipeline:

  1. Scan documents

  2. OCR files

  3. Clean up texts

  4. Tokenize text files (this notebook)


Import Libraries

import os
import json
import gzip
import zipfile
import nltk
import pandas as pd
from collections import Counter
from nltk.corpus import PlaintextCorpusReader

Define Special Functions for this Process

### Various functions written for this notebook ###

def convert_tuple_bigrams(tuples_to_convert):
    """Converts NLTK tuples into bigram strings"""
    string_grams = []
    for tuple_grams in tuples_to_convert:
        first_word = tuple_grams[0]
        second_word = tuple_grams[1]
        gram_string = f'{first_word} {second_word}'
        string_grams.append(gram_string)
    return string_grams

def convert_tuple_trigrams(tuples_to_convert):
    """Converts NLTK tuples into trigram strings"""
    string_grams = []
    for tuple_grams in tuples_to_convert:
        first_word = tuple_grams[0]
        second_word = tuple_grams[1]
        third_word = tuple_grams[2]
        gram_string = f'{first_word} {second_word} {third_word}'
        string_grams.append(gram_string)
    return string_grams

def convert_strings_to_counts(string_grams):
    """Counts a list of n-gram strings, returning a dictionary that maps each n-gram to its frequency"""
    counter_of_grams = Counter(string_grams)
    dict_of_grams = dict(counter_of_grams)
    return dict_of_grams
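
As a quick sanity check, the three helpers above can be exercised on a toy token list (the tokens are made up for illustration):

sample_tokens = ['the', 'quick', 'brown', 'fox']

sample_bigrams = convert_tuple_bigrams(nltk.bigrams(sample_tokens))
print(sample_bigrams)
# ['the quick', 'quick brown', 'brown fox']

sample_trigrams = convert_tuple_trigrams(nltk.trigrams(sample_tokens))
print(sample_trigrams)
# ['the quick brown', 'quick brown fox']

print(convert_strings_to_counts(sample_bigrams))
# {'the quick': 1, 'quick brown': 1, 'brown fox': 1}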

def update_metadata_from_csv():
    """Uses pandas to grab additional metadata fields from a CSV file, then adds them to the
    document's `data` dictionary. Relies on the notebook-level variables `df`, `identifier`,
    and `data`. Unused fields can be commented out."""
    title = df.loc[identifier, 'title']
    isPartOf = df.loc[identifier, 'isPartOf']
    publicationYear = str(df.loc[identifier, 'publicationYear'])
    doi = df.loc[identifier, 'doi']
    docType = df.loc[identifier, 'docType']
    provider = df.loc[identifier, 'provider']
    datePublished = df.loc[identifier, 'datePublished']
    issueNumber = str(df.loc[identifier, 'issueNumber'])
    volumeNumber = str(df.loc[identifier, 'volumeNumber'])
    url = df.loc[identifier, 'url']
    creator = df.loc[identifier, 'creator']
    publisher = df.loc[identifier, 'publisher']
    language = df.loc[identifier, 'language']
    pageStart = df.loc[identifier, 'pageStart']
    pageEnd = df.loc[identifier, 'pageEnd']
    placeOfPublication = df.loc[identifier, 'placeOfPublication']
    pageCount = str(df.loc[identifier, 'pageCount'])

    data.update([   
        ('title', title),
        ('isPartOf', isPartOf),
        ('publicationYear', publicationYear),
        ('doi', doi),
        ('docType', docType),
        ('provider', provider),
        ('datePublished', datePublished),
        ('issueNumber', issueNumber),
        ('volumeNumber', volumeNumber),
        ('url', url),
        ('creator', creator),
        ('publisher', publisher),
        ('language', language),
        ('pageStart', pageStart),
        ('pageEnd', pageEnd),
        ('placeOfPublication', placeOfPublication),
        ('pageCount', pageCount),
    ])
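
The function above assumes that metadata.csv has an 'id' column whose values match each text's filename without the .txt extension, plus the columns referenced in the code. A minimal header row for such a file would be:

id,title,isPartOf,publicationYear,doi,docType,provider,datePublished,issueNumber,volumeNumber,url,creator,publisher,language,pageStart,pageEnd,placeOfPublication,pageCount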

Unzip Texts Folder (optional)

### Extract Zip File of Texts ###
# The zip file should extract into a folder
# called 'texts'

filename = './data/texts.zip'

try:
    with zipfile.ZipFile(filename) as corpus_zip:
        corpus_zip.extractall('./data/')
    print('Zip file extracted successfully.')
except FileNotFoundError:
    print('No zip file detected. Upload your zip file to the data folder.')

Check for Metadata CSV (optional)

### Check for a metadata CSV file ###

csv_filename = 'metadata.csv'

if os.path.exists(f'./data/{csv_filename}'):
    csv_exists = True
    print('Metadata CSV found.')
else: 
    csv_exists = False
    print('No metadata CSV found.')

Import the Text Files into NLTK

### Establish root folder holding all text files ###
# Create corpus using all text files in corpus_root
# (PlaintextCorpusReader was imported at the top of the notebook)
corpus_root = './data/texts'
corpus = PlaintextCorpusReader(corpus_root, r'.*\.txt')

### Print all File IDs in corpus based on text file names ###
text_list = corpus.fileids()
print(f'Corpus created from: {text_list}')
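
Note that corpus.words() uses PlaintextCorpusReader's default tokenizer (WordPunctTokenizer), so punctuation marks are counted as tokens. To peek at the first few tokens of the first file (if any were found):

if text_list:
    print(corpus.words(text_list[0])[:10])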

Generate and Output Data to JSON-L File

If an old JSON-L file already exists, this process will overwrite it.

For each text, this code will:

  1. Gather unigrams, bigrams, trigrams, and full text

  2. Compute word counts

  3. Check for additional metadata in a CSV file

  4. Write the data to the JSON-L file

### Create the JSON-L file and gzip it ###

# For every text: 
# 1. Compute unigrams, bigrams, trigrams, and wordCount
# 2. Append the data to a JSON-L file
# After all data is written, compress the dataset using gzip
# NOTE: If the JSON-L file exists, it will be overwritten

# Define the file output name
output_filename = 'my_data.jsonl'

# Delete output files if they already exist
if os.path.exists(f'./data/{output_filename}'):
    os.remove(f'./data/{output_filename}')
    print(f'Overwriting old version of {output_filename}')

if os.path.exists(f'./data/{output_filename}.gz'):
    os.remove(f'./data/{output_filename}.gz')
    print(f'Overwriting old version of {output_filename}.gz\n')
                  

# Read the metadata CSV once (if present) and index it by document id
if csv_exists:
    df = pd.read_csv(f'./data/{csv_filename}')
    df.set_index('id', inplace=True)

for text in text_list:
    
    # Create identifier from filename
    identifier = text[:-4]
    
    # Compute unigrams
    unigrams = corpus.words(text)
    unigramCount = convert_strings_to_counts(unigrams)
    
    # Compute bigrams
    tuple_bigrams = list(nltk.bigrams(unigrams))
    string_bigrams = convert_tuple_bigrams(tuple_bigrams)
    bigramCount = convert_strings_to_counts(string_bigrams)
    
    # Compute trigrams
    tuple_trigrams = list(nltk.trigrams(unigrams))
    string_trigrams = convert_tuple_trigrams(tuple_trigrams)
    trigramCount = convert_strings_to_counts(string_trigrams)
    
    # Read the full text of the document (assuming UTF-8 encoding)
    with open(f'./data/texts/{text}', 'r', encoding='utf-8') as file:
        fullText = file.read()
    
    # Calculate wordCount by summing the unigram counts
    wordCount = sum(unigramCount.values())
  
    # Create a dictionary `data` to hold each document's data
    # Including id, wordCount, outputFormat, unigramCount,
    # bigramCount, trigramCount, fullText, etc.
    data = {}
    
    data.update([
        ('id', identifier),
        ('outputFormat', ['unigram', 'bigram', 'trigram', 'fullText']),
        ('wordCount', wordCount),
        ('fullText', fullText),
        ('unigramCount', unigramCount), 
        ('bigramCount', bigramCount), 
        ('trigramCount', trigramCount)
    ])
    
    # Add additional metadata if there is a metadata.csv available
    if csv_exists:
        update_metadata_from_csv()
        
    
    # Write the document to the json file  
    with open(f'./data/{output_filename}', 'a') as outfile:
        json.dump(data, outfile)
        outfile.write('\n')
        print(f'Text {text} written to json-l file.')

print(f'\n{len(text_list)} texts written to {output_filename}.')

Gzip the JSON-L file

# Gzip the dataset

with open(f'./data/{output_filename}', 'rb') as f_in:
    with gzip.open(f'./data/{output_filename}.gz', 'wb') as f_out:
        f_out.writelines(f_in)

print(f'Compression complete. \n{output_filename}.gz has been created.')
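
To spot-check the result, the gzipped JSON-L file can be read back one document per line (a minimal sketch):

with gzip.open(f'./data/{output_filename}.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        document = json.loads(line)
        print(document['id'], document['wordCount'])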