Home > AI > Uncategorized

Extracting Named Entities from Raw Text (Named Entity Recognition)

Some concept code snippets adapted from the official NLTK site; a few open questions remain.

# Tutorial link: https://www.nltk.org/book/ch07.html#ref-ie-postag

import nltk
import re
import pprint
import tensorflow as tf


# Command-line flag plumbing taken from TensorFlow's flags module.
flags = tf.flags
FLAGS = flags.FLAGS
# --experiment: name of the demo function that main() dispatches to.
# The default is '' (main() then prints the list of valid names); the help text is empty.
flags.DEFINE_string('experiment', '', '')


# Information Extraction

# 1 structured data
def structured_data():
    """Print the entities whose recorded location is Atlanta.

    Illustrates querying data that is already structured: every record is
    an (entity, relation, location) triple, so the answer is a plain filter.
    """
    records = [
        ('Omnicom', 'IN', 'New York'),
        ('DDB Needham', 'IN', 'New York'),
        ('Kaplan Thaler Group', 'IN', 'New York'),
        ('BBDO South', 'IN', 'Atlanta'),
        ('Georgia-Pacific', 'IN', 'Atlanta'),
    ]
    matches = []
    for entity, _relation, location in records:
        if location == 'Atlanta':
            matches.append(entity)
    print(matches)



# 2 unstructured data
# Flowchart:
# raw text
# ==> (sentence segmentation) ==> sentences
# nltk.sent_tokenize

# ==> (tokenization) ==> words
# nltk.word_tokenize

# ==> (part of speech tagging) ==> pos-tagged sentences
# nltk.pos_tag

# ==> (entity detection) ==> chunked sentences
# ==> (relation detection) ==> relations

# Applications:
# business intelligence, resume harvesting, media analysis, sentiment detection, patent search, and email scanning.


def preprocess(document):
    """Segment, tokenize and part-of-speech tag a raw text document.

    Runs the first three stages of the information-extraction pipeline:
    sentence segmentation, word tokenization and POS tagging.

    Args:
        document: Raw text as a single string.

    Returns:
        A list with one entry per sentence, each entry being the result of
        nltk.pos_tag for that sentence's tokens.
    """
    sentences = nltk.sent_tokenize(document)
    tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
    # Tag each tokenized sentence using the loop variable itself; the names
    # of the earlier comprehension are not visible here.
    return [nltk.pos_tag(tokens) for tokens in tokenized]


def pos_tag1():
    """Print the POS-tagged sentences of a short hard-coded news excerpt."""
    # Adjacent literals are concatenated into one string at compile time.
    news_excerpt = (
        'Washington, DC, United States (VoA) – State Department spokesperson '
        'Heather Nauert on Saturday said she has withdrawn her name from '
        'consideration for the post of U.S. ambassador to the United Nations. '
        'In December, President Donald Trump had announced he was picking '
        'Nauert to fill the vacancy caused when Nikki Haley stepped down from '
        'that position, leaving at the end of 2018.'
    )

    tagged_sentences = []
    for sentence in nltk.sent_tokenize(news_excerpt):
        tokens = nltk.word_tokenize(sentence)
        # Part-of-speech tagging of one tokenized sentence.
        tagged_sentences.append(nltk.pos_tag(tokens))
    print(tagged_sentences)


# Chunking (syntactic analysis)
# Experiment 1
def pos_tag2():
    """Chunk a hand-tagged sentence into noun phrases with a single rule.

    Prints the parse result and then calls its draw() method.
    """
    tagged_sentence = [
        ("the", "DT"),
        ("little", "JJ"),
        ("yellow", "JJ"),
        ("dog", "NN"),
        ("barked", "VBD"),
        ("at", "IN"),
        ("the", "DT"),
        ("cat", "NN"),
    ]
    # NP rule: optional determiner, any number of adjectives, then a noun.
    np_rule = 'NP: {<DT>?<JJ>*<NN>}'
    chunk_tree = nltk.RegexpParser(np_rule).parse(tagged_sentence)
    print(chunk_tree)
    chunk_tree.draw()


# Experiment 2
def pos_tag3():
    """Chunk a hand-tagged sentence using a two-rule noun-phrase grammar.

    Prints the parse result and then calls its draw() method.
    """
    tagged_sentence = [
        ("Rapunzel", "NNP"),
        ("let", "VBD"),
        ("down", "RP"),
        ("her", "PP$"),
        ("long", "JJ"),
        ("golden", "JJ"),
        ("hair", "NN"),
    ]
    # Raw string so that the backslash in PP\$ reaches the parser unchanged.
    np_grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
        {<NNP>+}                # chunk sequences of proper nouns
        """
    chunker = nltk.RegexpParser(np_grammar)
    chunk_tree = chunker.parse(tagged_sentence)
    print(chunk_tree)
    chunk_tree.draw()
'''
NP: noun phrase
'''


# named entity recognition
def net1():
    """Print the tagged sentences exposed by nltk.corpus.treebank."""
    treebank = nltk.corpus.treebank
    print(treebank.tagged_sents())



def main(_):
    """Run the demo selected by the --experiment flag.

    The single positional argument is ignored. When the flag value does not
    name a known experiment, the list of valid names is printed instead.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Map each accepted flag value to the demo function it triggers.
    experiments = {
        'structured_data': structured_data,
        'pos_tag1': pos_tag1,
        'pos_tag2': pos_tag2,
        'pos_tag3': pos_tag3,
        'net1': net1,
    }

    selected = experiments.get(FLAGS.experiment)
    if selected is None:
        print('specify experiment name: \nstructured_data / pos_tag1 / pos_tag2 / pos_tag3 / net1')
        return
    selected()


# Script entry point. tf.app.run() is presumably what parses the command-line
# flags and then calls main() — confirm against the installed TensorFlow 1.x API.
if __name__=='__main__':
    tf.app.run()

 

Related posts:

Leave a Reply