Skip to content

training shortlist on large-ish dataset #6

@cairomo

Description

@cairomo

Hello, I have gotten this model working for training on custom data. It gets through the surrogate and extreme steps, but during the calculation of the ANN/shortlist it seems to be trying to pickle an extremely large object. It throws this error:

File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/main.py", line 428, in main
    output = train(model, params)
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/main.py", line 107, in train
    surrogate_mapping=params.surrogate_mapping)
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 528, in fit
    use_intermediate_for_shorty, precomputed_intermediate)
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 337, in _fit
    self.save_checkpoint(model_dir, epoch+1)
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 695, in save_checkpoint
    model_dir, self.tracking.saved_checkpoints[-1]['ANN']))
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/shortlist.py", line 105, in save
    self.knn.save(fname+'.knn')
  File "/home/chmo/.local/lib/python3.7/site-packages/xclib-0.97-py3.7-linux-x86_64.egg/xclib/utils/shortlist.py", line 396, in save
    'space': self.space}, open(fname+".metadata", 'wb'))
OverflowError: cannot serialize a bytes object larger than 4 GiB

Is this where it is saving the kNN checkpoint? Why is the object so large that it exceeds 4 GiB? I am wondering because the paper reports that the model was trained on a dataset with a label space of 62 million. I am also running this training with a P100 and 61 workers.

Stats on my dataset:
number of features: 111,786
number of labels: 699,001

And my config:

{
    "global": {
        "dataset": "leaf",
        "feature_type": "sparse",
        "num_labels": 699001,
        "arch": "Astec",
        "A": 0.55,
        "B": 1.5,
        "use_reranker": true,
        "surrogate_threshold": 512,
        "surrogate_method": 1,
        "embedding_dims": 391,
        "top_k": 100,
        "beta": 0.3,
        "save_predictions": true,
        "trn_label_fname": "trn_X_Y.txt",
        "val_label_fname": "tst_X_Y.txt",
        "tst_label_fname": "tst_X_Y.txt",
        "trn_feat_fname": "trn_X_Xf.txt",
        "val_feat_fname": "tst_X_Xf.txt",
        "tst_feat_fname": "tst_X_Xf.txt"
    },
    "surrogate": {
        "num_epochs": 20,
        "dlr_factor": 0.5,
        "learning_rate": 0.01,
        "batch_size": 255,
        "dlr_step": 14,
        "normalize": true,
        "init": "token_embeddings",
        "optim": "Adam",
        "embeddings": "fasttextB_embeddings_391d.npy",
        "validate": true,
        "save_intermediate": true
    },
    "extreme": {
        "num_epochs": 20,
        "dlr_factor": 0.5,
        "learning_rate": 0.007,
        "batch_size": 255,
        "dlr_step": 14,
        "ns_method": "ensemble",
        "num_centroids": 1,
        "efC": 300,
        "efS": 400,
        "M": 100,
        "num_nbrs": 500,
        "ann_threads": 18,
        "beta": 0.5,
        "surrogate_mapping": true,
        "num_clf_partitions": 1,
        "optim": "Adam",
        "freeze_intermediate": true,
        "validate": true,
        "model_method": "shortlist",
        "normalize": true,
        "shortlist_method": "hybrid",
        "init": "intermediate",
        "use_shortlist": true,
        "use_intermediate_for_shorty": true
    },
    "reranker": {
        "num_epochs": 20,
        "dlr_factor": 0.5,
        "learning_rate": 0.005,
        "batch_size": 255,
        "dlr_step": 10,
        "beta": 0.6,
        "num_clf_partitions": 1,
        "optim": "Adam",
        "validate": true,
        "model_method": "reranker",
        "shortlist_method": "static",
        "surrogate_mapping": true,
        "normalize": true,
        "use_shortlist": true,
        "init": "token_embeddings",
        "save_intermediate": false,
        "keep_invalid": true,
        "freeze_intermediate": false,
        "update_shortlist": false,
        "use_pretrained_shortlist": true,
        "embeddings": "fasttextB_embeddings_391d.npy"
    }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions