
Transformers Script Runs, but Breaks in PyCharm Debugger

Posted By xandercage.intellipaat 5 Years Ago



I'm taking an AI course and am running the following script in debug mode to better understand the internal workings of the Transformers model.generate() function. It's part of an API I'm building for a client, so ignore the Flask code; the key issue is getting the debugger to work so I can follow the tokenized text through the model's generation process. Does the Transformers library break under a debugger? Why might that be?:

import os
import shutil
import subprocess

import numpy as np
import torch
from flask import Flask, request
from flask_restful import Api, Resource, reqparse
import transformers
from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2LMHeadModel, GPT2Tokenizer

'''Model Imports'''
app = Flask(__name__)
api = Api(app)


def get_model(model_dir):
    if not os.path.exists(model_dir):
        print(f'Building model directory at {model_dir}')
        os.mkdir(model_dir)
    try:
        command = f'aws s3 sync AWS_BUCKET {model_dir}'
        subprocess.call(command.split())
    except:
        print('AWS commandline call failed. Have you configured the AWS cli yet?')


MODEL_DIR = "./model"
if not os.path.exists(MODEL_DIR):
    get_model(MODEL_DIR)

NUM_PATTERN = r'\s\d+[A-Za-z]*'
output_model_file = os.path.join(MODEL_DIR, WEIGHTS_NAME)
output_config_file = os.path.join(MODEL_DIR, CONFIG_NAME)

# Re-load the saved model and vocabulary
print('Loading model!')
model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_DIR)

'''Arg Parser'''
parser = reqparse.RequestParser()
parser.add_argument('prompt', type=str, help='Main input string to be transformed. REQUIRED.', required=True)
parser.add_argument('max_length', type=int, help='Max length for generation.', default=20)
parser.add_argument('repetition_penalty', type=int, help='Penalty for word repetition. Higher = fewer repetitions.', default=5)
parser.add_argument('length_penalty', type=int, help='Exponential penalty for length. Higher = shorter sentences.', default=1)
parser.add_argument('num_beams', type=int, help='# Beams to use for beam search.', default=5)
parser.add_argument('temperature', type=float, help='Temperature of the softmax operation used in generation.', default=3)
parser.add_argument('top_k', type=int, help='Top words to select from during text generation.', default=50)
parser.add_argument('top_p', type=float, help='Top-P for Nucleus Sampling. Lower = more restrictive search.', default=0.8)
parser.add_argument('num_return_sequences', type=int, help='Number of sequences to generate.', default=1)


def decode(output):
    return str(tokenizer.decode(output, skip_special_tokens=True))


class TransformerAPI(Resource):
    def get(self):
        args = parser.parse_args()
        app.logger.info(f'Using model loaded from {MODEL_DIR}.')
        ids = tokenizer.encode(args['prompt'])
        inp = torch.tensor(np.array(ids)[np.newaxis, :])

        # Account for generation limits < input value
        if inp.shape[1] >= args['max_length']:
            print(inp.shape[1])
            print(args['max_length'])
            result = inp[:, :args['max_length']]
            print(result)
            decoded = [decode(result.tolist()[0])] * args['num_return_sequences']
            return {'completion': decoded,
                    'model_used': MODEL_DIR}
        else:
            result = model.generate(input_ids=inp,
                                    max_length=args['max_length'],
                                    repetition_penalty=args['repetition_penalty'],
                                    length_penalty=args['length_penalty'],
                                    do_sample=True,
                                    num_beams=args['num_beams'],
                                    temperature=args['temperature'],
                                    top_k=args['top_k'],
                                    top_p=args['top_p'],
                                    num_return_sequences=args['num_return_sequences'])
            decoded = [decode(l.tolist()) for l in result]
            return {'completion': decoded,
                    'model_used': MODEL_DIR}


api.add_resource(TransformerAPI, '/api/v1')

if __name__ == '__main__':
    # app.run(debug=True)
    ids = tokenizer.encode('The present invention')
    inp = torch.tensor(np.array(ids)[np.newaxis, :])
    result = model.generate(input_ids=inp,
                            max_length=15,
                            repetition_penalty=5,
                            length_penalty=1,
                            do_sample=True,
                            num_beams=5,
                            temperature=3,
                            num_return_sequences=1)
    print(result)
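To take Flask out of the picture entirely, the failing load plus a single generate() call can be reduced to something like this. It's a minimal sketch assuming the same ./model directory holding the fine-tuned GPT-2 checkpoint:

# Minimal sketch isolating the failing call; assumes the same ./model
# directory containing the fine-tuned GPT-2 checkpoint and vocabulary.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

MODEL_DIR = "./model"
model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_DIR)  # the call that fails under the debugger

ids = tokenizer.encode('The present invention')
inp = torch.tensor([ids])  # add a batch dimension: shape (1, seq_len)
result = model.generate(input_ids=inp, max_length=15, do_sample=True, num_beams=5)
print(tokenizer.decode(result[0].tolist(), skip_special_tokens=True))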


python app.py executes just fine, but running the same script in PyCharm's debug mode hits an error:

Traceback (most recent call last):
  File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/pydevd.py", line 1438, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "/Users/mgb/Desktop/Work/Apteryx_Clients_2/bao/bao-ai/apteryx_apis/patformer/app.py", line 45, in <module>
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_DIR)
  File "/Users/mgb/opt/anaconda3/envs/transformers/lib/python3.8/site-packages/transformers/tokenization_utils.py", line 282, in from_pretrained
    return cls._from_pretrained(*inputs, **kwargs)
  File "/Users/mgb/opt/anaconda3/envs/transformers/lib/python3.8/site-packages/transformers/tokenization_utils.py", line 411, in _from_pretrained
    tokenizer = cls(*init_inputs, **init_kwargs)
  File "/Users/mgb/opt/anaconda3/envs/transformers/lib/python3.8/site-packages/transformers/tokenization_gpt2.py", line 118, in __init__
    super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
  File "/Users/mgb/opt/anaconda3/envs/transformers/lib/python3.8/site-packages/transformers/tokenization_utils.py", line 232, in __init__
    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
AssertionError




