# Source code for pdf2chem

# Imports
import chemdataextractor as cde

import cirpy
import time

import sys
import re
import pandas as pd
import os
import textract
from datetime import datetime

# Future work: let the user choose whether 3-letter words are queried, e.g.
#     if curation_type == "Regular (ignores 3-letter words)":
#         regex_number = 4
#     else:
#         regex_number = 3
# Until then the setting is fixed to the equivalent of
#     curation_type = "Exhaustive (includes 3-letter words)"
regex_number = 3

# Default location to look for pdf files: wherever the process was started.
pdf_path = os.getcwd()

# Hosted (Google Colab) and local runtimes use different textract backends;
# Colab is detected by checking whether its package has been imported.
IN_COLAB = "google.colab" in sys.modules
pdf_method = "pdfminer" if IN_COLAB else "pdftotext"

print("We\'ll use {} as the pdf extraction method.".format(pdf_method))

# element symbols and false positives hard-coded until stable hosting is found
#
# Lowercase symbols of the 118 elements, hydrogen through oganesson, in
# atomic-number order.  Entries must not carry surrounding whitespace: they
# are compared against extracted words with a plain ``in`` membership test,
# so a padded entry such as 'ds ' could never match.
element_symbols = ['h', 'he', 'li', 'be', 'b', 'c', 'n', 'o', 'f', 'ne', 'na',
                   'mg', 'al', 'si', 'p', 's', 'cl', 'ar', 'k', 'ca',
                   'sc', 'ti', 'v', 'cr', 'mn', 'fe', 'co', 'ni', 'cu',
                   'zn', 'ga', 'ge', 'as', 'se', 'br', 'kr', 'rb', 'sr',
                   'y', 'zr', 'nb', 'mo', 'tc', 'ru', 'rh', 'pd', 'ag',
                   'cd', 'in', 'sn', 'sb', 'te', 'i', 'xe', 'cs', 'ba',
                   'la', 'ce', 'pr', 'nd', 'pm', 'sm', 'eu', 'gd', 'tb',
                   'dy', 'ho', 'er', 'tm', 'yb', 'lu', 'hf', 'ta', 'w',
                   're', 'os', 'ir', 'pt', 'au', 'hg', 'tl', 'pb', 'bi',
                   'po', 'at', 'rn', 'fr', 'ra', 'ac', 'th', 'pa', 'u',
                   'np', 'pu', 'am', 'cm', 'bk', 'cf', 'es', 'fm', 'md',
                   'no', 'lr', 'rf', 'db', 'sg', 'bh', 'hs', 'mt', 'ds',
                   'rg', 'cn', 'nh', 'fl', 'mc', 'lv', 'ts', 'og']

# Lowercase strings that should not be sent to the resolver even when the
# NLP step flags them as chemicals: quick_curate skips any extracted item
# found in this list (after stripping '.' and ',' from its ends and
# lowercasing it).  Per the comment in quick_curate these include author
# names and a few other odds and ends.
# The list contains repeated entries (e.g. 'may', 'van'); that is harmless
# for the membership test it is used in.
# NOTE(review): entries that end in '.' (e.g. 'soc.', 'b.v.') look
# unreachable, because quick_curate removes periods from the text and strips
# them from each item before the lookup -- confirm before pruning.
false_positives = ['reno', 'lower', 'format', 'lead', 'nci', 'cc', 'isi',
                   'doi', "\\'b", 'is', 'ph', 'mv', 'zone', 'based', 'on',
                   'final', 'kato', 'cm', 'life', 'versus', 'www', 'can',
                   'ate', 'mm', 'crystal', 'sem', 'an', 's1', 'force', 'may',
                   'any', 'lau', 'voltage', 'kc', 'mino', 'm. h.', 'set',
                   'selective', 'c.p.k.', 'same', 'page 10', 'm-1', 'ai',
                   'c1', 'm2', 'et', 'fulfill', 'dry', 'via', 'may', 'pka',
                   'any', 'edge', 'b.v.', 'final', 'rt', '2b', 'h.y.', 'y.k.',
                   'v.v.', 'w.y.', 'good', 'region', 'cycle', 'des', 'force',
                   'may', 'dsc', 'chcl', 'counter', 'van', 'see', 'best',
                   'green', 'equal', 'result', 'challenge', 'substance',
                   'spectrum', 'der', 'its', 'glass', 'all', 'new', 'mix',
                   'so', 'soc.', 'arm', 'nm', 'ran', 'enable', 'sd', 'saa',
                   'map', 'ac1', 'fab', 'act', 'b7', 'liu', 'check', 'dual',
                   'via', 'den', 'fc', 'if', 'rapid', 'san', 'van', 'control',
                   'see', 'harry', 'adam', 'line', 'ac-1', 'sig', 'recruit',
                   'bli', 'test', 'tau', 'acs', 'iap', 'box', 'campaign',
                   'target', 'gfp', 'new', 'cv', 'rt', 'lid', 'compound',
                   'selective', 'rfb', 'ment', 'est', 'mm', 'con', 'con-',
                   's4', 'harry', 'ip', 'lp', 'ple', 'ml', 'prone', 'pka',
                   'sum', 'derivative', 'ten', 'min', 'vortex', 'gradual',
                   'tot', 'ber', 'red', 'ing', 'para', 'phs', 'gen', 'dft',
                   'nals', 'enable', 'set', 'versus', 'ma', 'the', 'and',
                   'eo', 'cps', 'ep', 'are', 'same', 'cos', 'age', 'sem',
                   's4', 'cycle', 'far', 'cal', 'overall', 'net', 'et', 'ml',
                   's1', 'prone', 'capture', 'or', 'rise', 'but', 'diurnal',
                   'dry', 'may', 'of', 'off', 'dp', 'if', 'dants', 'van',
                   'eden', 'line', 'tx', 'top', 'va', 'per', 'ny', 'on',
                   'ing', 'cp', 'for', 'dc', 'air', 'nhe', 'gas', 'zonal',
                   'all', 'new', 'based', 'had', 'ph', 'cm3', 'pyrite', 'soc',
                   'ser', 'acc', 'res', 'eds', 'mp', 'pro', 'inc', 'im', 'bv',
                   'disodium', 'ab', 'ed', 'carboxylate', '1mm', 'nat', 'eq',
                   'acc', 'sci', 'mol', 'int', 'sc-s', 'scs', 'gu', 'atm',
                   'shi', '2az', 'abbott', 'ms', 'wang', 'pdc', 'franklin',
                   'bay', 'dess', 'hbd', 'retard', 'intercept', 'iii',
                   'acid', 'fraction', 'aldrich', 'triton', 'cda', 'cyano',
                   'vinyl', 'flux', 'ethyl', 'methyl', 'mit', 'trigger',
                   'accelerate', 'ants', 'pentyl', 'laser', 'india', 'dos',
                   'los', 'acetyl', 'dec', 'sheets', 'tem', 'dimethyl',
                   'serial', 'tag', 'tandem', 'trap', 'mic', 'exciton',
                   'aldehyde', 'combat', 'roi', 'probiotic', 'antiviral',
                   'cada', 'beam', 'austin', 'lactone', 'lumen', 'diethyl',
                   'optimal', 'sulfoxide', 'gm3', 'gel', 'blockade', 'omega',
                   'cubes', 'bin', 'alcohols', 'alcohol', 'benchmark',
                   'portal', 'matrix', 'apex', 'bacterial', 'cube', 'linker',
                   'cascade', 'optimum', 'carbonyl oxygen', 'facet', 'shield']

# When regex_number is 3, prune the false-positive list: an entry survives
# only if it contains a run of at least four characters from [a-zA-Z0-9+-],
# or contains no run of three such characters at all.  Entries whose longest
# run is exactly three characters are therefore dropped from the list.
if regex_number == 3:
    false_positives = list(filter(
        lambda entry: (re.search("[a-zA-Z0-9+-]{4}", entry)
                       or not re.search("[a-zA-Z0-9+-]{3}", entry)),
        false_positives))

"""# Define functions"""

# Notices printed before each retry of a failed resolver lookup, in order.
# The number of entries is the number of retries after the first attempt.
_RETRY_NOTICES = (
    'Exception raised.  Pausing for 2 seconds and trying again',
    'Exception raised.  Pausing for another 2 seconds and trying again',
    'Exception raised.  Pausing for one more stretch and trying again',
)


def _clean_extracted_text(raw_text):
    """Return ``str(raw_text)`` with newline markup and boilerplate removed.

    The replacements are applied in order; the order matters because the
    escaped-newline-plus-hyphen pattern has to be removed before escaped
    newlines are turned into spaces.
    """
    # NOTE: textract output is stringified with str() rather than decoded, so
    # line breaks show up as the two characters backslash + 'n' and the text
    # is wrapped in a b'...' marker; the patterns below are written for that.
    text = str(raw_text)
    replacements = (
        ('\\n-', ''),
        ('\\-\n', ''),
        ('\\-\n-', ''),
        ('\\n', ' '),
        ('\n', ' '),
        ('.', ''),
        ('*', ''),
        ('ISSN', ''),
        ('NSF', ''),
        ('NIH', ''),
        ("b'", ''),
        ('\\r', ''),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def _rejoin_hyphenated(words):
    """Return a new list where a word ending in '-' is glued to its successor.

    Only the trailing hyphen(s) are dropped, so hyphens inside the word are
    kept.  A hyphen-terminated word with nothing after it is left unchanged.
    """
    joined = []
    index = 0
    while index < len(words):
        word = words[index]
        if word.endswith('-') and index + 1 < len(words):
            joined.append(word.rstrip('-') + words[index + 1])
            index += 2
        else:
            joined.append(word)
            index += 1
    return joined


def _resolve_with_retries(name):
    """Look *name* up as SMILES with cirpy, retrying after exceptions.

    One attempt is made immediately; after each failure a notice is printed
    and the lookup is retried after a 2 second pause, once per entry in
    ``_RETRY_NOTICES``.  Returns whatever ``cirpy.resolve`` returns; raises
    the last exception when every attempt fails.
    """
    try:
        return cirpy.resolve(name, 'smiles')
    except Exception as err:  # network hiccups and the like
        last_error = err
    for notice in _RETRY_NOTICES:
        print(last_error)
        print(notice)
        time.sleep(2)
        try:
            return cirpy.resolve(name, 'smiles')
        except Exception as err:
            last_error = err
    raise last_error


def quick_curate(pdf_path, pdf_method, false_positives, regex_number):
    """
    Extract chemical names from one pdf file and resolve them to SMILES.

    The text is pulled out with textract, cleaned up, cut off at the last
    word starting with "reference", and handed to chemdataextractor.  Each
    entity it reports is then resolved through NIH's CIR (via cirpy) unless
    it is filtered out first.  Results are written next to the pdf as
    ``<pdf name>_<YYYYMMDD>_names_and_SMILES.csv`` and, when anything was
    missed, ``<pdf name>_<YYYYMMDD>_zzz_missed_items.csv``.

    Parameters
    ----------
    pdf_path : string
        path to the pdf file; also used as the stem of the output files
    pdf_method : string
        extraction method passed to ``textract.process``
    false_positives : collection of strings
        lowercase words that must not be queried
    regex_number : int
        when 3, items without a run of three characters from
        ``[a-zA-Z0-9+-]`` are treated as likely false positives

    Returns
    -------
    pandas.DataFrame
        the 'Name' / 'SMILES' table that was written to the csv file
    """
    # extract the text from the pdf
    # the pdf_method should adapt to both local and hosted
    # runtime compatibility
    text = _clean_extracted_text(textract.process(pdf_path,
                                                  method=pdf_method))

    # split by white spaces
    words = re.split(r"\s+", text)

    # try to remove reference section by cutting off everything after the last
    # mention of reference
    reference_positions = [i for i, w in enumerate(words)
                           if w.lower().startswith('reference')]
    if reference_positions:
        words = words[:reference_positions[-1]]

    # reconnect any words that got hyphenated and cut off at the end
    # of a column
    words = _rejoin_hyphenated(words)

    print('The initial list for {} has {} words.'.format(pdf_path,
                                                         len(words)))

    # reconstruct a text string from the cleaned list,
    # as cde's NLP works on strings
    cleaned_text = ''.join(word + ' ' for word in words)

    # have cde do NLP on the string and convert the results
    # into a list of strings
    doc = cde.Document(cleaned_text)
    chem_strings = [str(span).lower().replace('\n', ' ')
                    for span in doc.cems]

    # remove blanks, and anything left with a backslash or a plus sign in it
    chem_strings = [name for name in chem_strings
                    if name and not re.search(r'[\\+]', name)]

    print('We\'ll attempt to resolve {} '
          'potential chemicals.'.format(len(chem_strings)))

    # exactly one entry is appended to smiles_list per item, so that it
    # stays aligned with chem_strings when the two are zipped below
    smiles_list = []
    already_queried = set()
    missed_items = []
    known_false_positives = set(false_positives)

    for item in chem_strings:

        # if Sn is found, it's probably tin, not S=C
        # NOTE(review): 'SnH4' is recorded as-is; confirm downstream tools
        # accept it without SMILES brackets.
        if item.lower() == "sn":
            smiles_list.append('SnH4')
            print(item, smiles_list[-1])
            continue

        # keeping element symbols, such as H, C, or Na: they skip the
        # filters below and are looked up on every occurrence
        # this may turn into an option
        if item not in element_symbols:

            # Future work - include options for user to specify exclusion
            # of 3-letter words (regex_number == 4)
            if regex_number == 3 and not re.search("[a-zA-Z0-9+-]{3}",
                                                   item):
                smiles_list.append(None)
                print('Found a word that\'s a likely false '
                      'positive: {}'.format(item))
                missed_items.append(item)
                continue

            # save time by not querying chemicals that are in the text
            # many times
            if item in already_queried:
                smiles_list.append(None)
                print('We\'ve already queried this one: {}'.format(item))
                continue
            already_queried.add(item)

            # don't query the chemical if it's a known false positive
            # these include author names and a few other odds and ends
            if item.strip('.').strip(',').lower() in known_false_positives:
                smiles_list.append(None)
                print('Found one known to be a false positive: {}'
                      .format(item))
                continue

        # if the item passes all the tests,
        # attempt to resolve it via NIH's CIR
        try:
            smiles = _resolve_with_retries(item)
        except Exception as e:
            print(e)
            print('It still raised an exception.  Here\'s '
                  'how far it got:')
            print(smiles_list)
            print(len(smiles_list))
            print('This item will be added to a list '
                  'called missed items.')
            print(item)
            smiles_list.append('Check')
            missed_items.append(item)
        else:
            smiles_list.append(smiles)
            print(item, smiles)
            # short pause between successful queries
            time.sleep(0.21)

    # tidy these up into pandas dataframes and export them as csv files
    output_stem = (os.path.splitext(pdf_path)[0] + '_'
                   + datetime.today().strftime('%Y%m%d'))
    chem_df = pd.DataFrame(list(zip(chem_strings, smiles_list)),
                           columns=['Name', 'SMILES'])
    # rows whose SMILES is None (skipped or unresolved items) are dropped
    chem_df = chem_df.dropna()
    chem_df.to_csv(output_stem + '_names_and_SMILES.csv')
    if missed_items:
        missed_df = pd.DataFrame(missed_items, columns=['Missed'])
        missed_df = missed_df.drop_duplicates()
        missed_df.to_csv(output_stem + '_zzz_missed_items.csv')

    return chem_df

def aggregate_csv_files(pdf_dir=None):
    """
    Combine the csv files in a folder into a single csv file.

    Every ``*.csv`` file in the folder is read and concatenated, except
    files produced by an earlier run of this function (names ending in
    ``combined_csv.csv``), so that re-running does not duplicate rows.  The
    result is written to ``<YYYYMMDD>combined_csv.csv`` in the same folder.

    Parameters
    ----------
    pdf_dir : string, optional
        folder holding the csv files (the default is the current working
        directory at the time of the call)
    """
    if pdf_dir is None:
        pdf_dir = os.getcwd()

    combined_suffix = "combined_csv.csv"
    # sorted so the row order of the combined file is reproducible
    csv_names = sorted(name for name in os.listdir(pdf_dir)
                       if name.endswith('.csv')
                       and not name.endswith(combined_suffix))
    if not csv_names:
        # pd.concat raises on an empty list, so stop here instead
        print('No csv files were found to combine in ' + str(pdf_dir))
        return

    all_chemicals = pd.concat([pd.read_csv(os.path.join(pdf_dir, name))
                               for name in csv_names])
    all_chemicals.to_csv(os.path.join(pdf_dir,
                                      datetime.today().strftime('%Y%m%d')
                                      + combined_suffix),
                         index=False, encoding='utf-8-sig')

"""# Curate pdfs"""

#@title ## Curator output will appear below
def curate_folder(pdf_dir=None):
    """
    Extract known chemicals from a folder of pdf files, and export a
    .csv file of SMILES strings, a machine-readable chemical format,
    for each file and a combined .csv for all the pdf files.

    Extended Summary
    ----------------
    For each pdf file in the folder, ``quick_curate`` is run on it with the
    module-level ``pdf_method``, ``false_positives`` and ``regex_number``
    settings.  A failure on one file is printed and does not stop the
    others.  Afterwards ``aggregate_csv_files`` is asked to combine the
    output csv files; a failure there is printed as well.

    The process's working directory is changed to ``pdf_dir`` and is left
    there when the function returns.

    Parameters
    ----------
    pdf_dir : string, optional
        path to a folder of pdf files (the default is the current working
        directory at the time of the call)

    Raises
    ------
    AssertionError
        if ``pdf_dir`` does not exist
    """
    # resolve the default at call time, not once at import time
    if pdf_dir is None:
        pdf_dir = os.getcwd()

    # Raised explicitly (rather than with an assert statement) so the check
    # still runs under ``python -O``; the exception type is unchanged.
    if not os.path.exists(pdf_dir):
        raise AssertionError("I did not find the directory at, "
                             + str(pdf_dir))

    # the per-file and combining steps work relative to the current directory
    os.chdir(pdf_dir)

    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith('.pdf'):
            continue
        try:
            quick_curate(filename, pdf_method, false_positives, regex_number)
        except Exception as e:
            print('An exception was raised for ' + filename)
            print(e)

    try:
        aggregate_csv_files()
    except Exception as e:
        print(e)
        print("An error occurred while trying to combine "
              "the output csv files.")