Productivity
TextRL is a Python library that aims to improve text generation using reinforcement learning, building upon Hugging Face's Transformers, PFRL, and OpenAI GYM. TextRL is designed to be easily customizable and can be applied to various text-generation models.
TextRL utilizes reinforcement learning to fine-tune text generation models. It is built upon the following libraries: Hugging Face's Transformers, PFRL, and OpenAI GYM.
gpt2
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import sys
# Send INFO-level log records to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="")

# Tokenizer and causal language model are loaded from the same checkpoint.
checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="auto",
).cuda()  # explicit move of the loaded model to CUDA
class MyRLEnv(TextRLEnv):
    """Environment with a placeholder reward: 1 once generation finishes, else 0."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return one reward per entry of ``predicted_list``.

        Args:
            input_item: the observation (prompt) currently being generated from.
            predicted_list: the predictions produced so far, one per compared sample.
            finish: truthy once the sentence has finished generating.

        Returns:
            A list of rewards with the same length as ``predicted_list``.
        """
        # Placeholder: calculate the real reward score based on predicted_list here.
        score = 1 if finish else 0
        # The reward list must be as long as predicted_list (one score per sample).
        return [score] * len(predicted_list)
observaton_list = [{"input": "explain how attention work in seq2seq model"}]

# Build the environment from MyRLEnv so its get_reward override is actually used;
# the base TextRLEnv would ignore the reward function defined above.
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(
    env, model, tokenizer,
    act_deterministically=False,
    temperature=1.0,
    top_k=0,
    top_p=1.0,
    repetition_penalty=2,
)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

# Generation before training, for comparison with the output after training.
print(actor.predict(observaton_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom—test',
)

# Generation after training.
print(actor.predict(observaton_list[0]))
flan-t5
colab example: google/flan-t5-base
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import logging
import sys
# `pipeline` is called below but is not among the names imported above.
from transformers import pipeline

# Send INFO-level log records to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model.eval()
model.cuda()

# Sentiment classifier whose score is used as the reward signal in get_reward.
sentiment = pipeline(
    'sentiment-analysis',
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
    device=0,
    return_all_scores=True,
)
class MyRLEnv(TextRLEnv):
    """Environment whose reward is taken from the sentiment classifier."""

    def get_reward(self, input_item, predicted_list, finish):
        """Score the first prediction once it is finished or has hit the length limit.

        Returns 0 while generation is still in progress; otherwise returns ten
        times the first score the sentiment pipeline reports for the prompt
        concatenated with the predicted text.
        """
        still_generating = not finish and len(predicted_list[0]) < self.env_max_length
        if still_generating:
            return 0

        generated_text = tokenizer.convert_tokens_to_string(predicted_list[0])
        # sentiment classifier: [0][0] picks the first score of the first result
        first_score = sentiment(input_item['input'] + generated_text)[0][0]['score']
        return first_score * 10
observaton_list = [{'input': 'i think dogecoin is'}]
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, compare_sample=1)

actor = TextRLActor(
    env, model, tokenizer,
    optimizer='adamw',
    temperature=0.8,
    top_k=100,
    top_p=0.85,
)
agent = actor.agent_ppo(update_interval=50, minibatch_size=3, epochs=10, lr=3e-4)

# Output before training, for comparison with the output after training.
print(actor.predict(observaton_list[0]))

pfrl.experiments.train_agent_with_evaluation(
    agent, env,
    steps=3000,
    eval_n_steps=None,
    eval_n_episodes=1,
    train_max_episode_len=100,
    eval_interval=10,
    outdir='checkpoint',
)

# Load the checkpoint stored under outdir as "best", then generate again.
agent.load("./checkpoint/best")
print(actor.predict(observaton_list[0]))
bigscience/bloomz-7b1-mt
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import sys
# Send INFO-level log records to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="")

# Tokenizer and causal language model are loaded from the same checkpoint.
checkpoint = "bigscience/bloomz-7b1-mt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="auto",
).cuda()  # explicit move of the loaded model to CUDA
class MyRLEnv(TextRLEnv):
    """Environment with a placeholder reward: 1 once generation finishes, else 0."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return one reward per entry of ``predicted_list``.

        Args:
            input_item: the observation (prompt) currently being generated from.
            predicted_list: the predictions produced so far, one per compared sample.
            finish: truthy once the sentence has finished generating.

        Returns:
            A list of rewards with the same length as ``predicted_list``.
        """
        # Placeholder: calculate the real reward score based on predicted_list here.
        score = 1 if finish else 0
        # The reward list must be as long as predicted_list (one score per sample).
        return [score] * len(predicted_list)
observaton_list = [{"input": "explain how attention work in seq2seq model"}]

# Build the environment from MyRLEnv so its get_reward override is actually used;
# the base TextRLEnv would ignore the reward function defined above.
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(
    env, model, tokenizer,
    act_deterministically=False,
    temperature=1.0,
    top_k=0,
    top_p=1.0,
)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

# Generation before training, for comparison with the output after training.
print(actor.predict(observaton_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom—test',
)

# Generation after training.
print(actor.predict(observaton_list[0]))
We strongly recommend contributing to the public swarm to increase Petals capacity:
https://github.com/bigscience-workshop/petals
First, install Petals: pip install petals -U
import pfrl
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import BloomTokenizerFast
from petals import DistributedBloomForCausalLM
import logging
import sys
# Send INFO-level log records to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="")

# Tokenizer and distributed BLOOM model are loaded under the same model name.
MODEL_NAME = "bigscience/bloom-petals"
tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME)
model = DistributedBloomForCausalLM.from_pretrained(MODEL_NAME).cuda()
class MyRLEnv(TextRLEnv):
    """Environment with a placeholder reward: 1 once generation finishes, else 0."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return one reward per entry of ``predicted_list``.

        Args:
            input_item: the observation (prompt) currently being generated from.
            predicted_list: the predictions produced so far, one per compared sample.
            finish: truthy once the sentence has finished generating.

        Returns:
            A list of rewards with the same length as ``predicted_list``.
        """
        # Placeholder: calculate the real reward score based on predicted_list here.
        score = 1 if finish else 0
        # The reward list must be as long as predicted_list (one score per sample).
        return [score] * len(predicted_list)
observaton_list = [{"input": "explain how attention work in seq2seq model"}]

# Build the environment from MyRLEnv so its get_reward override is actually used;
# the base TextRLEnv would ignore the reward function defined above.
env = MyRLEnv(model, tokenizer, observation_input=observaton_list, max_length=20, compare_sample=2)
actor = TextRLActor(
    env, model, tokenizer,
    act_deterministically=False,
    temperature=1.0,
    top_k=0,
    top_p=1.0,
)
agent = actor.agent_ppo(update_interval=2, minibatch_size=2, epochs=10)

# Generation before training, for comparison with the output after training.
print(actor.predict(observaton_list[0]))

train_agent_with_evaluation(
    agent,
    env,
    steps=100,
    eval_n_steps=None,
    eval_n_episodes=1,
    eval_interval=2,
    outdir='bloom—test',
)

# Generation after training.
print(actor.predict(observaton_list[0]))
[Controllable generation via RL to let Elon Musk speak ill of DOGE ](https://github.com/voidful/TextRL/blob/main/example/2022-12-10-textrl-elon-musk.ipynb)
colab example: bigscience/bloom-560m
colab example: huggingtweets/elonmusk
before: i think dogecoin is a great idea.
after: i think dogecoin is a great idea, but I think it is a little overused.
pip install pfrl@git+https://github.com/voidful/pfrl.git
pip install textrl
git clone and cd into this project.
pip install -e .
import torch
from textrl import TextRLEnv, TextRLActor, train_agent_with_evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer
# Tokenizer and causal language model are loaded from the same checkpoint.
checkpoint = "bigscience/bloomz-7b1-mt"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="auto",
).cuda()  # explicit move of the loaded model to CUDA
class MyRLEnv(TextRLEnv):
    """Environment template: override get_reward with your own scoring."""

    def get_reward(self, input_item, predicted_list, finish):
        """Return one reward per entry of ``predicted_list``.

        get_reward may be called before the sentence is finished, so a reward
        is always bound; previously it was only assigned when ``finish`` was
        true and the ``return`` raised UnboundLocalError otherwise.
        """
        # Default reward while generation is still in progress.
        reward = [0] * len(predicted_list)
        if finish:
            reward = [0] * len(predicted_list)  # calculate reward score based on predicted_list
        return reward
# observation_list should be a list of all possible input strings for model training
# Example: observation_list = [{"input":'testing sent 1'},{"input":'testing sent 2'}]
# Defined here with the example values so the snippet runs as written; replace
# them with your own training inputs.
observation_list = [{"input": 'testing sent 1'}, {"input": 'testing sent 2'}]

env = MyRLEnv(model, tokenizer, observation_input=observation_list)
actor = TextRLActor(env, model, tokenizer)
agent = actor.agent_ppo(update_interval=10, minibatch_size=2000, epochs=20)
n_episodes = 1000
max_episode_len = 200  # max sentence length

for i in range(1, n_episodes + 1):
    obs = env.reset()
    R = 0  # return (sum of rewards) of the current episode
    t = 0  # time step within the current episode
    while True:
        action = agent.act(obs)
        obs, reward, done, pred = env.step(action)
        # get_reward returns a list of scores, which cannot be added to the
        # integer total directly; fold it into a scalar (scalars pass through).
        R += sum(reward) if isinstance(reward, (list, tuple)) else reward
        t += 1
        # Force the episode to end once the length limit is reached.
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
        if done or reset:
            break
    if i % 10 == 0:
        print('episode:', i, 'R:', R)
    if i % 50 == 0:
        print('statistics:', agent.get_statistics())
print('Finished.')
Another way to train:
import logging
import sys
# Send INFO-level log records to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

# NOTE(review): eval_interval (10000) is larger than steps (1000); confirm that an
# evaluation actually runs and writes "somewhere/best" before the load below.
train_agent_with_evaluation(
    agent,
    env,
    steps=1000,
    eval_n_steps=None,
    eval_n_episodes=1500,
    train_max_episode_len=50,
    eval_interval=10000,
    outdir='somewhere',
)

agent.load("somewhere/best")  # loading the best model
# predict is given an observation item in the same {"input": ...} form as the
# entries of observation_input, not a bare string.
actor.predict({"input": "input text"})
This usage section provides a comprehensive guide on how to initialize the agent and environment, set up the reward function for the environment, prepare for training, train the model, and make predictions. It also includes an alternative way to train the model using the train_agent_with_evaluation function.
textrl-dump --model ./model_path_before_rl --rl ./rl_path --dump ./output_dir
To finetune a language model using RL, you need to modify the reward function:
from textrl import TextRLEnv
class MyRLEnv(TextRLEnv):
    def get_reward(self, input_item, predicted_list, finish):
        # input_item is the prompt input for the model; it will be one of your observations.
        # An observation will be a list of sentences, e.g. ['inputted sentence','xxx','yyy'];
        # only the first one ('inputted sentence') is fed to the model, and
        # the remaining ones can serve as references for reward calculation.
        # predicted_list is the list of predicted sentences the RL model generated;
        # it will be used for ranking reward calculation.
        # finish is the end-of-sentence flag. get_reward is called while each word is generated;
        # when finish is True the sentence is complete, which is where sentence-level
        # reward calculation belongs.
        # reward should be a list whose length equals the length of predicted_list.
        reward = [0] * len(predicted_list)  # placeholder: replace with your own scores
        return reward
Parameters for sampling diverse examples:
# compare_sample (num of sample to rank) is an environment argument in every
# example in this file, e.g. TextRLEnv(model, tokenizer, ..., compare_sample=2),
# so it is not passed to the actor here.
actor = TextRLActor(
    env, model, tokenizer,
    act_deterministically=False,  # select the max probability token for each step or not
    temperature=1,                # temperature for sampling
    top_k=0,                      # top k sampling
    top_p=1.0,                    # top p sampling
)
When training a reinforcement learning (RL) model, several key parameters need to be tuned to ensure optimal performance. Here is a list of important parameters and their descriptions:
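update_interval=10: number of environment steps collected between model updates.
minibatch_size=2000: number of samples in each minibatch used for a gradient step.
epochs=20: number of passes over the collected data in each update.
gamma=0.99: discount factor applied to future rewards.
lr=1e-4: learning rate of the optimizer.
epsilon=0.2: PPO clipping range for the policy probability ratio.
entropy_coef=0.01: weight of the entropy bonus, which encourages exploration.
steps=1000: total number of training timesteps.
eval_interval=10000: number of training steps between evaluations.
train_max_episode_len=50: maximum length of an episode during training.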
These parameters need to be carefully tuned based on the specific problem and environment to achieve the best performance. It is generally recommended to start with default values and then adjust them based on the observed learning behavior.