Tutorial results in obscure errors.

I've been following the webstruct tutorial and I'm getting a few peculiar errors.
From the tutorial I end up with code along the lines of this:

    from itertools import islice
    import pkg_resources
    import webstruct


    def token_identity(html_token):
        """Return a feature dict exposing the token's text under ``'token'``."""
        text = html_token.token
        return dict(token=text)


    def token_isupper(html_token):
        """Return a feature dict with ``str.isupper()`` of the token text under ``'isupper'``."""
        upper_flag = html_token.token.isupper()
        return dict(isupper=upper_flag)


    def parent_tag(html_token):
        """Return a feature dict with the ``tag`` of the token's ``parent`` under ``'parent_tag'``."""
        enclosing = html_token.parent
        return dict(parent_tag=enclosing.tag)


    def border_at_left(html_token):
        """Return a feature dict flagging, under ``'border_at_left'``, whether ``index`` equals 0."""
        at_start = html_token.index == 0
        return dict(border_at_left=at_start)


    # Filesystem path of the 'data/business_annotated' resource directory inside
    # the 'project' package; get_training() globs its *.html files.
    DATA_DIR = pkg_resources.resource_filename('project', 'data/business_annotated')


    def get_training():
        """Return an iterator over at most the first 10 trees loaded from DATA_DIR.

        Every ``*.html`` file under ``DATA_DIR`` is passed to
        ``webstruct.load_trees`` with a ``WebAnnotatorLoader``.
        """
        pattern = "{}/*.html".format(DATA_DIR)
        loader = webstruct.WebAnnotatorLoader()
        all_trees = webstruct.load_trees(pattern, loader)
        # todo: only the first 10 trees are used for now
        return islice(all_trees, 10)


    def tokenize_training(trees):
        """Tokenize *trees* with a default ``webstruct.HtmlTokenizer``.

        Returns the ``(tokens, labels)`` pair produced by ``tokenize``.
        """
        tokens, labels = webstruct.HtmlTokenizer().tokenize(trees)
        return tokens, labels


    def main():
        """Build a Wapiti pipeline, fit it on the training data, and print what NER extracts from a URL."""
        print('creating model...')
        # The pipeline uses the four token feature functions defined above.
        # NOTE(review): 'company.wapiti' is presumably the model file name -- confirm
        # against the webstruct documentation.
        model = webstruct.create_wapiti_pipeline(
            'company.wapiti',
            token_features=[token_identity, token_isupper, parent_tag, border_at_left],
            train_args='--algo l-bfgs --maxiter 50 --compact',
        )
        print('getting training data...')
        tokens, labels = tokenize_training(get_training())
        print('fitting training data...')
        model.fit(tokens, labels)
        print('starting extract...')
        # Wrap the fitted pipeline in NER and run it on a live page.
        ner = webstruct.NER(model)
        print(ner.extract_from_url('http://scrapinghub.com/contact'))

    # Run the example only when this file is executed as a script.
    if __name__ == '__main__':
        main()

The first error I get is a TypeError when trying to extract something with `ner`:

    Traceback (most recent call last):
      File "/home/dex/projects/project/project/spiders/test.py", line 54, in <module>
        main()
      File "/home/dex/projects/project/project/spiders/test.py", line 51, in main
        print(ner.extract_from_url('http://scrapinghub.com/contact'))
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 58, in extract_from_url
        return self.extract(data)
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 46, in extract
        groups = IobEncoder.group(zip(html_tokens, tags))
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/sequence_encoding.py", line 128, in group
        return list(cls.iter_group(data, strict))
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/sequence_encoding.py", line 136, in iter_group
        if iob_tag.startswith('I-') and tag != iob_tag[2:]:
    TypeError: startswith first arg must be bytes or a tuple of bytes, not str

It seems like a Python 3 support issue, as it expects bytes but gets a string?

The second error occurs when trying to build a `ner` straight from the model without fitting it first:


    def main():
        """Build a Wapiti pipeline and run NER on a URL without fitting the model first."""
        print('creating model...')
        model = webstruct.create_wapiti_pipeline(
            'company.wapiti',
            token_features=[token_identity, token_isupper, parent_tag, border_at_left],
            train_args='--algo l-bfgs --maxiter 50 --compact',
        )
        # Training is deliberately disabled here: the pipeline is handed to NER unfitted.
        # print('getting training data...')
        # tokens, labels = tokenize_training(get_training())
        # print('fitting training data...')
        # model.fit(tokens, labels)
        # print('starting extract...')
        ner = webstruct.NER(model)
        print(ner.extract_from_url('http://scrapinghub.com/contact'))

Results in:

    Traceback (most recent call last):
      File "/home/dex/projects/project/project/spiders/test.py", line 53, in <module>
        main()
      File "/home/dex/projects/project/project/spiders/test.py", line 50, in main
        print(ner.extract_from_url('http://scrapinghub.com/contact'))
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 58, in extract_from_url
        return self.extract(data)
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 45, in extract
        html_tokens, tags = self.extract_raw(bytes_data)
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/model.py", line 67, in extract_raw
        tags = self.model.predict([html_tokens])[0]
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
        out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/sklearn/pipeline.py", line 327, in predict
        return self.steps[-1][-1].predict(Xt)
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 211, in predict
        sequences = self._to_wapiti_sequences(X)
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 230, in _to_wapiti_sequences
        X = self.feature_encoder.transform(X)
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 313, in transform
        return [self.transform_single(feature_dicts) for feature_dicts in X]
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 313, in <listcomp>
        return [self.transform_single(feature_dicts) for feature_dicts in X]
      File "/home/dex/.virtualenvs/people/lib/python3.6/site-packages/webstruct/wapiti.py", line 308, in transform_single
        line = ' '.join(_tostr(dct.get(key)) for key in self.feature_names_)
    TypeError: 'NoneType' object is not iterable

The errors seem to be very vague and I don't even know where to start debugging this. Am I missing something?

I'm running:  
`webstruct` - 0.5  
`scikit-learn` - 0.18.2  
`scipy` - 0.19  
`libwapiti` - 0.2.1  


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Tutorial results in obscure errors. #40

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Tutorial results in obscure errors. #40

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions