Skip to content

Tutorial results in obscure errors. #40

@Granitosaurus

Description

@Granitosaurus

I've been following the webstruct tutorial and I'm getting a few peculiar errors.
From the tutorial I end up with code along the lines of this:

from itertools import islice
import pkg_resources
import webstruct


def token_identity(html_token):
    """Return a one-entry feature dict holding the token's own text.

    Args:
        html_token: Any object exposing a ``token`` attribute
            (presumably a webstruct HTML token -- confirm against caller).

    Returns:
        ``{'token': <html_token.token>}``.
    """
    return dict(token=html_token.token)


def token_isupper(html_token):
    """Return a feature dict flagging whether the token text is upper-case.

    Args:
        html_token: Any object whose ``token`` attribute supports
            ``.isupper()`` (e.g. a string).

    Returns:
        ``{'isupper': <result of html_token.token.isupper()>}``.
    """
    text = html_token.token
    return dict(isupper=text.isupper())


def parent_tag(html_token):
    """Return a feature dict naming the tag of the token's parent.

    Args:
        html_token: Any object exposing ``parent.tag``.

    Returns:
        ``{'parent_tag': <html_token.parent.tag>}``.
    """
    parent = html_token.parent
    return dict(parent_tag=parent.tag)


def border_at_left(html_token):
    """Return a feature dict flagging whether the token's index equals 0.

    Args:
        html_token: Any object exposing an ``index`` attribute.

    Returns:
        ``{'border_at_left': True}`` when ``html_token.index == 0``,
        otherwise ``{'border_at_left': False}``.
    """
    is_first = html_token.index == 0
    return dict(border_at_left=is_first)


# Filesystem path of the 'data/business_annotated' resource inside the
# 'project' package; get_training() globs the *.html files found there.
DATA_DIR = pkg_resources.resource_filename('project', 'data/business_annotated')


def get_training():
    """Load annotated HTML trees from ``DATA_DIR``, capped at the first ten.

    Returns:
        An ``itertools.islice`` iterator yielding at most 10 of the trees
        produced by ``webstruct.load_trees`` for ``<DATA_DIR>/*.html`` using
        a ``webstruct.WebAnnotatorLoader``.
    """
    pattern = "{}/*.html".format(DATA_DIR)
    loader = webstruct.WebAnnotatorLoader()
    all_trees = webstruct.load_trees(pattern, loader)
    # todo (carried over from the original; the cap of 10 is presumably temporary)
    return islice(all_trees, 10)


def tokenize_training(trees):
    """Tokenize ``trees`` with a default ``webstruct.HtmlTokenizer``.

    Args:
        trees: Passed straight through to ``HtmlTokenizer.tokenize``.

    Returns:
        A ``(tokens, labels)`` tuple unpacked from the tokenizer's result.
    """
    token_seqs, label_seqs = webstruct.HtmlTokenizer().tokenize(trees)
    return token_seqs, label_seqs


def main():
    """Build a Wapiti pipeline, fit it on the training data, and run extraction.

    Progress messages are printed before each stage; the final line prints
    whatever ``NER.extract_from_url`` returns for the hard-coded contact URL.
    """
    print('creating model...')
    pipeline = webstruct.create_wapiti_pipeline(
        'company.wapiti',
        token_features=[token_identity, token_isupper, parent_tag, border_at_left],
        train_args='--algo l-bfgs --maxiter 50 --compact',
    )

    print('getting training data...')
    training_trees = get_training()
    tokens, labels = tokenize_training(training_trees)

    print('fitting training data...')
    pipeline.fit(tokens, labels)

    print('starting extract...')
    extractor = webstruct.NER(pipeline)
    entities = extractor.extract_from_url('http://scrapinghub.com/contact')
    print(entities)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

The first error I get is a TypeError when trying to extract something with the NER:

Traceback (most recent call last):
  File "/home/dex/projects/project/project/spiders/test.py", line 54, in <module>
    main()
  File "/home/dex/projects/project/project/spiders/test.py", line 51, in main
    print(ner.extract_from_url('http://scrapinghub.com/contact'))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 58, in extract_from_url
    return self.extract(data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 46, in extract
    groups = IobEncoder.group(zip(html_tokens, tags))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/sequence_encoding.py", line 128, in group
    return list(cls.iter_group(data, strict))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/sequence_encoding.py", line 136, in iter_group
    if iob_tag.startswith('I-') and tag != iob_tag[2:]:
TypeError: startswith first arg must be bytes or a tuple of bytes, not str

It seems like a Python 3 support issue, as it expects bytes but gets a string?

The second error occurs when trying to build an NER straight from the model without fitting it first:

def main():
    """Variant of main() that skips training and extracts with an unfitted model.

    The fitting steps are deliberately left commented out below, so the
    pipeline handed to ``webstruct.NER`` has never had ``fit`` called on it.
    """
    print('creating model...')
    model = webstruct.create_wapiti_pipeline(
        'company.wapiti',
        token_features=[token_identity, token_isupper, parent_tag, border_at_left],
        train_args='--algo l-bfgs --maxiter 50 --compact',
    )
    # The training/fitting stage is disabled on purpose for this variant:
    # print('getting training data...')
    # tokens, labels = tokenize_training(get_training())
    # print('fitting training data...')
    # model.fit(tokens, labels)
    # print('starting extract...')
    # NOTE(review): `model` is used here without being fitted.
    ner = webstruct.NER(model)
    print(ner.extract_from_url('http://scrapinghub.com/contact'))

Results in:

Traceback (most recent call last):
  File "/home/dex/projects/project/project/spiders/test.py", line 53, in <module>
    main()
  File "/home/dex/projects/project/project/spiders/test.py", line 50, in main
    print(ner.extract_from_url('http://scrapinghub.com/contact'))
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 58, in extract_from_url
    return self.extract(data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 45, in extract
    html_tokens, tags = self.extract_raw(bytes_data)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 67, in extract_raw
    tags = self.model.predict([html_tokens])[0]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/sklearn/pipeline.py", line 327, in predict
    return self.steps[-1][-1].predict(Xt)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 211, in predict
    sequences = self._to_wapiti_sequences(X)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 230, in _to_wapiti_sequences
    X = self.feature_encoder.transform(X)
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 313, in transform
    return [self.transform_single(feature_dicts) for feature_dicts in X]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 313, in <listcomp>
    return [self.transform_single(feature_dicts) for feature_dicts in X]
  File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 308, in transform_single
    line = ' '.join(_tostr(dct.get(key)) for key in self.feature_names_)
TypeError: 'NoneType' object is not iterable

The errors seem to be very vague and I don't even know where to start debugging this. Am I missing something?

I'm running:
webstruct - 0.5
scikit-learn - 0.18.2
scipy - 0.19
libwapiti - 0.2.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions