Sungwoo Kim
/
HydraulicControlBoard_Rainbow_v1_2_copy1
2011
Diff: main.cpp
- Revision:
- 173:68c7914679ec
- Parent:
- 172:63af34265fe9
- Child:
- 174:c828479f53f9
diff -r 63af34265fe9 -r 68c7914679ec main.cpp --- a/main.cpp Sat Nov 21 07:25:32 2020 +0000 +++ b/main.cpp Tue Nov 24 05:19:59 2020 +0000 @@ -1,4 +1,4 @@ -//201121_2 +//201124_1 #include "mbed.h" #include "FastPWM.h" #include "INIT_HW.h" @@ -268,28 +268,36 @@ float input_RL[num_input_RL] = { 0.0f }; //Critic Networks -float hc1[num_input_RL][num_hidden_unit] = {0.0f}; -float bc1[num_hidden_unit] = {0.0f}; -float hc2[num_hidden_unit] = {0.0f}; -float bc2 = 0.0f; +float hc1[num_input_RL][num_hidden_unit1] = {0.0f}; +float bc1[num_hidden_unit1] = {0.0f}; +float hc2[num_hidden_unit1][num_hidden_unit2] = {0.0f}; +float bc2[num_hidden_unit2] = {0.0f}; +float hc3[num_hidden_unit2] = {0.0f}; +float bc3 = 0.0f; //Critic Networks Temporary -float hc1_temp[num_input_RL][num_hidden_unit] = {0.0f}; -float bc1_temp[num_hidden_unit] = {0.0f}; -float hc2_temp[num_hidden_unit] = {0.0f}; -float bc2_temp = 0.0f; +float hc1_temp[num_input_RL][num_hidden_unit1] = {0.0f}; +float bc1_temp[num_hidden_unit1] = {0.0f}; +float hc2_temp[num_hidden_unit1][num_hidden_unit2] = {0.0f}; +float bc2_temp[num_hidden_unit2] = {0.0f}; +float hc3_temp[num_hidden_unit2] = {0.0f}; +float bc3_temp = 0.0f; //Actor Networks -float ha1[num_input_RL][num_hidden_unit] = {0.0f}; -float ba1[num_hidden_unit] = {0.0f}; -float ha2[num_hidden_unit][2] = {0.0f}; -float ba2[2] = {0.0f}; +float ha1[num_input_RL][num_hidden_unit1] = {0.0f}; +float ba1[num_hidden_unit1] = {0.0f}; +float ha2[num_hidden_unit1][num_hidden_unit2] = {0.0f}; +float ba2[num_hidden_unit2] = {0.0f}; +float ha3[num_hidden_unit2][2] = {0.0f}; +float ba3[2] = {0.0f}; //Actor Networks Temporary -float ha1_temp[num_input_RL][num_hidden_unit] = {0.0f}; -float ba1_temp[num_hidden_unit] = {0.0f}; -float ha2_temp[num_hidden_unit][2] = {0.0f}; -float ba2_temp[2] = {0.0f}; +float ha1_temp[num_input_RL][num_hidden_unit1] = {0.0f}; +float ba1_temp[num_hidden_unit1] = {0.0f}; +float ha2_temp[num_hidden_unit1][num_hidden_unit2] = {0.0f}; +float ba2_temp[num_hidden_unit2] = {0.0f}; +float ha3_temp[num_hidden_unit2][2] = {0.0f}; +float ba3_temp[2] = {0.0f}; float VALVE_POS_RAW_NN = 0.0f; float DDV_JOINT_POS_FF(float REF_JOINT_VEL); @@ -297,38 +305,82 @@ float Critic_Network(float *arr) { - float output1[num_hidden_unit] = { 0.0f }; + float output1[num_hidden_unit1] = { 0.0f }; + float output2[num_hidden_unit2] = { 0.0f }; float output = 0.0f; - for (int index2 = 0; index2 < num_hidden_unit; index2++) { + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { output1[index2] = output1[index2] + hc1[index1][index2] * arr[index1]; } - output1[index2] = tanh(output1[index2] + bc1[index2]); + //ReLU + output1[index2] = output1[index2] + bc1[index2]; + hx_c_sum[index2] = output1[index2]; + if (output1[index2] < 0) { + output1[index2] = 0; + } + //tanh + //output1[index2] = tanh(output1[index2] + bc1[index2]); + } + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + output2[index2] = output2[index2] + hc2[index1][index2] * arr[index1]; + } + //ReLU + output2[index2] = output2[index2] + bc2[index2]; + hxh_c_sum[index2] = output2[index2]; + if (output2[index2] < 0) { + output2[index2] = 0; + } + //tanh + //output2[index2] = tanh(output2[index2] + bc2[index2]); } for (int index2 = 0; index2 < 1; index2++) { - for (int index1 = 0; index1 < num_hidden_unit; index1++) { - output = output + hc2[index1] * output1[index1]; + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + output = output + hc3[index1] * output2[index1]; } - output = output + bc2; + output = output + bc3; + hxhh_c_sum = output; } return output; } float Critic_Network_Temp(float *arr) { - float output1[num_hidden_unit] = { 0.0f }; + float output1[num_hidden_unit1] = { 0.0f }; + float output2[num_hidden_unit2] = { 0.0f }; float output = 0.0f; - for (int index2 = 0; index2 < num_hidden_unit; index2++) { + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { output1[index2] = output1[index2] + hc1_temp[index1][index2] * arr[index1]; } - output1[index2] = tanh(output1[index2] + bc1_temp[index2]); + //ReLU + output1[index2] = output1[index2] + bc1_temp[index2]; + hx_c_sum[index2] = output1[index2]; + if (output1[index2] < 0) { + output1[index2] = 0; + } + //tanh + //output1[index2] = tanh(output1[index2] + bc1_temp[index2]); + } + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + output2[index2] = output2[index2] + hc2_temp[index1][index2] * arr[index1]; + } + //ReLU + output2[index2] = output2[index2] + bc2_temp[index2]; + hxh_c_sum[index2] = output2[index2]; + if (output2[index2] < 0) { + output2[index2] = 0; + } + //tanh + //output2[index2] = tanh(output2[index2] + bc2_temp[index2]); } for (int index2 = 0; index2 < 1; index2++) { - for (int index1 = 0; index1 < num_hidden_unit; index1++) { - output = output + hc2_temp[index1] * output1[index1]; + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + output = output + hc3_temp[index1] * output2[index1]; } - output = output + bc2_temp; + output = output + bc3_temp; + hxhh_c_sum = output; } return output; } @@ -336,10 +388,53 @@ void Actor_Network(float *arr) { - float output1[num_hidden_unit] = {0.0f}; + float output1[num_hidden_unit1] = {0.0f}; + float output2[num_hidden_unit2] = {0.0f}; float output[2] = {0.0f}; - for (int index2 = 0; index2 < num_hidden_unit; index2++) { + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { + for (int index1 = 0; index1 < num_input_RL; index1++) { + output1[index2] = output1[index2] + ha1[index1][index2] * arr[index1]; + } + output1[index2] = output1[index2] + ba1[index2]; + hx_a_sum[index2] = output1[index2]; + if (output1[index2] < 0) { + output1[index2] = 0; + } + } + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + output2[index2] = output2[index2] + ha2[index1][index2] * arr[index1]; + } + output2[index2] = output2[index2] + ba2[index2]; + hxh_a_sum[index2] = output2[index2]; + if (output2[index2] < 0) { + output2[index2] = 0; + } + } + for (int index2 = 0; index2 < 2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + output[index2] = output[index2] + ha3[index1][index2] * output2[index1]; + } + } + hxhh_a_sum[0] = output[0] + ba3[0]; + hxhh_a_sum[1] = output[1] + ba3[1]; + + mean_before_SP = output[0] + ba3[0]; //SP = softplus + deviation_before_SP = output[1] + ba3[1]; + //Softplus + mean = log(1.0f+exp(mean_before_SP)); + deviation = log(1.0f+exp(deviation_before_SP)); +} + + +void Actor_Network_Old(float *arr) +{ + float output1[num_hidden_unit1] = {0.0f}; + float output2[num_hidden_unit2] = {0.0f}; + float output[2] = {0.0f}; + + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { output1[index2] = output1[index2] + ha1[index1][index2] * arr[index1]; } @@ -348,62 +443,25 @@ output1[index2] = 0; } } - for (int index2 = 0; index2 < 2; index2++) { - for (int index1 = 0; index1 < num_hidden_unit; index1++) { - output[index2] = output[index2] + ha2[index1][index2] * output1[index1]; + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + output2[index2] = output2[index2] + ha2[index1][index2] * arr[index1]; } - } - mean_before_SP = output[0] + ba2[0]; //SP = softplus - deviation_before_SP = output[1] + ba2[1]; -// mean = log(1.0f+exp(mean_before_SP)); -// deviation = log(1.0f+exp(deviation_before_SP)); - if (mean_before_SP >=0) { - mean = mean_before_SP; - } else { - mean = 0.0f; - } - if (deviation_before_SP >=0) { - deviation = deviation_before_SP; - } else { - deviation = 0.0f; - } - -} - - -void Actor_Network_Old(float *arr) -{ - float output1[num_hidden_unit] = {0.0f}; - float output[2] = {0.0f}; - - for (int index2 = 0; index2 < num_hidden_unit; index2++) { - for (int index1 = 0; index1 < num_input_RL; index1++) { - output1[index2] = output1[index2] + ha1[index1][index2] * arr[index1]; - } - output1[index2] = output1[index2] + ba1[index2]; - if (output1[index2] < 0) { - output1[index2] = 0; + output2[index2] = output2[index2] + ba2[index2]; + if (output2[index2] < 0) { + output2[index2] = 0; } } for (int index2 = 0; index2 < 2; index2++) { - for (int index1 = 0; index1 < num_hidden_unit; index1++) { - output[index2] = output[index2] + ha2[index1][index2] * output1[index1]; + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + output[index2] = output[index2] + ha3[index1][index2] * output2[index1]; } } - mean_old = output[0] + ba2[0]; - deviation_old = output[1] + ba2[1]; -// mean_old = log(1.0f+exp(mean_old)); -// deviation_old = log(1.0f+exp(deviation_old)); - if (mean_before_SP >=0) { - mean_old = mean_before_SP; - } else { - mean_old = 0.0f; - } - if (deviation_before_SP >=0) { - deviation_old = deviation_before_SP; - } else { - deviation_old = 0.0f; - } + mean_old = output[0] + ba3[0]; + deviation_old = output[1] + ba3[1]; + //Softplus + mean_old = log(1.0f+exp(mean_old)); + deviation_old = log(1.0f+exp(deviation_old)); } float Grad_Normal_Dist_Mean(float mean, float deviation, float action) @@ -420,221 +478,430 @@ return grad_dev; } +float ReLU(float x) +{ + if (x >= 0) { + return x; + } else { + return 0.0f; + } +} + void update_Critic_Networks(float (*arr)[num_input_RL]) { float gradient_rate = 0.001f; - - float G_hc1[num_input_RL][num_hidden_unit] = {0.0f}; - float G_bc1[num_hidden_unit] = {0.0f}; - for (int index2 = 0; index2 < num_hidden_unit; index2++) { +// float hx_sum = 0.0f; + + + ///////////////////////////////////////////////////////////CRITIC + float G_hc1[num_input_RL][num_hidden_unit1] = {0.0f}; + float G_bc1[num_hidden_unit1] = {0.0f}; + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { - for (int i=0; i<batch_size; i++) { - float hx_sum = 0.0f; - float hx_sum_next = 0.0f; - for(int j=0; j<num_input_RL; j++) { - hx_sum = hx_sum + hc1_temp[j][index2]*arr[i][j]; - if (i==batch_size-1) hx_sum_next = 0.0f; - else hx_sum_next = hx_sum_next + hc1_temp[j][index2]*arr[i+1][j]; + for (int n=0; n<batch_size; n++) { + for(int k=0; k<num_hidden_unit2; k++) { + if (hxh_c_sum_array[n][k] >= 0) { + if (hx_c_sum_array[n][index2] > 0) { + G_hc1[index1][index2] = G_hc1[index1][index2] + arr[n][index1]*hc2_temp[index2][k]*hc3_temp[k]; + } + } } - if (i==batch_size-1) G_hc1[index1][index2] = G_hc1[index1][index2] + 2.0f*advantage[i]*(-hc2_temp[index2]*(1.0f-tanh(hx_sum + bc1_temp[index2])*tanh(hx_sum + bc1_temp[index2]))*arr[i][index1]); - else G_hc1[index1][index2] = G_hc1[index1][index2] + 2.0f*advantage[i]*(hc2_temp[index2]*(1.0f-tanh(hx_sum_next + bc1_temp[index2])*tanh(hx_sum_next + bc1_temp[index2]))*arr[i+1][index1] - hc2_temp[index2]*(1.0f-tanh(hx_sum + bc1_temp[index2])*tanh(hx_sum + bc1_temp[index2]))*arr[i][index1]); } G_hc1[index1][index2] = G_hc1[index1][index2] / batch_size; + //hc1_temp[index1][index2] = hc1_temp[index1][index2] - gradient_rate * G_hc1[index1][index2]; + } + for (int n=0; n<batch_size; n++) { + for(int k=0; k<num_hidden_unit2; k++) { + if (hxh_c_sum_array[n][k] >= 0) { + if (hx_c_sum_array[n][index2] > 0) { + G_bc1[index2] = G_bc1[index2] + hc2_temp[index2][k]*hc3_temp[k]; + } + + } + } + } + G_bc1[index2] = G_bc1[index2] / batch_size; + //bc1_temp[index2] = bc1_temp[index2] - gradient_rate * G_bc1[index2]; + } + + float G_hc2[num_hidden_unit1][num_hidden_unit2] = {0.0f}; + float G_bc2[num_hidden_unit2] = {0.0f}; + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + for (int n=0; n<batch_size; n++) { + if (hxh_c_sum_array[n][index2] >= 0) { + if (hx_c_sum_array[n][index1] > 0) { + G_hc2[index1][index2] = G_hc2[index1][index2] + hx_c_sum_array[n][index1]*hc3_temp[index2]; + } + } + } + G_hc2[index1][index2] = G_hc2[index1][index2] / batch_size; + //hc2_temp[index1][index2] = hc2_temp[index1][index2] - gradient_rate * G_hc2[index1][index2]; + } + for (int n=0; n<batch_size; n++) { + if (hxh_c_sum_array[n][index2] >= 0) { + G_bc2[index2] = G_bc2[index2] + hc3_temp[index2]; + } + } + G_bc2[index2] = G_bc2[index2] / batch_size; + //bc2_temp[index2] = bc2_temp[index2] - gradient_rate * G_bc2[index2]; + } + + float G_hc3[num_hidden_unit2]= {0.0f}; + float G_bc3 = 0.0f; + for (int index2 = 0; index2 < 1; index2++) { + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + for (int n=0; n<batch_size; n++) { + if (hxh_c_sum_array[n][index1] >= 0) { + G_hc3[index1] = G_hc3[index1] + hxh_c_sum_array[n][index1]; + } + } + G_hc3[index1] = G_hc3[index1] / batch_size; + //hc3_temp[index1] = hc3_temp[index1] - gradient_rate * G_hc3[index1]; + } + for (int n=0; n<batch_size; n++) { + G_bc2[index2] = G_bc2[index2] + 1.0f; + } + G_bc3 = G_bc3 / batch_size; + //bc3_temp = bc3_temp - gradient_rate * G_bc3; + } + + // Simultaneous Update + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { + for (int index1 = 0; index1 < num_input_RL; index1++) { hc1_temp[index1][index2] = hc1_temp[index1][index2] - gradient_rate * G_hc1[index1][index2]; } - for (int i=0; i<batch_size; i++) { - float hx_sum = 0.0f; - float hx_sum_next = 0.0f; - for(int j=0; j<num_input_RL; j++) { - hx_sum = hx_sum + hc1_temp[j][index2]*arr[i][j]; - if (i==batch_size-1) hx_sum_next = 0.0f; - else hx_sum_next = hx_sum_next + hc1_temp[j][index2]*arr[i+1][j]; - } - if (i==batch_size-1) G_bc1[index2] = G_bc1[index2] + 2.0f*advantage[i]*(-hc2_temp[index2]*(1.0f-tanh(hx_sum + bc1_temp[index2])*tanh(hx_sum + bc1_temp[index2]))); - else G_bc1[index2] = G_bc1[index2] + 2.0f*advantage[i]*(hc2_temp[index2]*(1.0f-tanh(hx_sum_next + bc1_temp[index2])*tanh(hx_sum_next + bc1_temp[index2])) - hc2_temp[index2]*(1.0f-tanh(hx_sum + bc1_temp[index2])*tanh(hx_sum + bc1_temp[index2]))); - } - G_bc1[index2] = G_bc1[index2] / batch_size; bc1_temp[index2] = bc1_temp[index2] - gradient_rate * G_bc1[index2]; } - - float G_hc2[num_hidden_unit] = {0.0f}; - float G_bc2 = 0.0f; + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + hc2_temp[index1][index2] = hc2_temp[index1][index2] - gradient_rate * G_hc2[index1][index2]; + } + bc2_temp[index2] = bc2_temp[index2] - gradient_rate * G_bc2[index2]; + } for (int index2 = 0; index2 < 1; index2++) { - for (int index1 = 0; index1 < num_hidden_unit; index1++) { - for (int i=0; i<batch_size; i++) { - float hx_sum = 0.0f; - float hx_sum_next = 0.0f; - for(int j=0; j<num_input_RL; j++) { - hx_sum = hx_sum + hc1_temp[j][index1]*arr[i][j]; - if (i==batch_size-1) hx_sum_next = 0.0f; - else hx_sum_next = hx_sum_next + hc1_temp[j][index1]*arr[i+1][j]; - } - if (i==batch_size-1) G_hc2[index1] = G_hc2[index1] - 2.0f*advantage[i]*tanh(hx_sum + bc1_temp[index1]); - else G_hc2[index1] = G_hc2[index1] + 2.0f*advantage[i]*(tanh(hx_sum_next + bc1_temp[index1]) - tanh(hx_sum + bc1_temp[index1])); - } - G_hc2[index1] = G_hc2[index1] / batch_size; - hc2_temp[index1] = hc2_temp[index1] - gradient_rate * G_hc2[index1]; + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + hc3_temp[index1] = hc3_temp[index1] - gradient_rate * G_hc3[index1]; } - for (int i=0; i<batch_size; i++) { - if (i==batch_size-1) G_bc2 = G_bc2 + 2.0f*advantage[i]*(-1.0f); - else G_bc2 = 0.0f; - } - G_bc2 = G_bc2/ batch_size; - bc2_temp = bc2_temp - gradient_rate * G_bc2; + bc3_temp = bc3_temp - gradient_rate * G_bc3; } + } +///////////////////////////Softplus////////////////////////////////// void update_Actor_Networks(float (*arr)[num_input_RL]) { - float gradient_rate = 0.001f; //-0.01f - - float G_ha1[num_input_RL][num_hidden_unit] = {0.0f}; - float G_ba1[num_hidden_unit] = {0.0f}; - for (int index2 = 0; index2 < num_hidden_unit; index2++) { + float gradient_rate = 0.001f; + + float G_ha1[num_input_RL][num_hidden_unit1] = {0.0f}; + float G_ba1[num_hidden_unit1] = {0.0f}; + float d_x_d_ha1[num_input_RL][num_hidden_unit1] = {0.0f}; + float d_x_d_ba1[num_hidden_unit1] = {0.0f}; + float d_y_d_ha1[num_input_RL][num_hidden_unit1] = {0.0f}; + float d_y_d_ba1[num_hidden_unit1] = {0.0f}; + + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { - for (int i=0; i<batch_size; i++) { - if((advantage[i] >= 0.0f && ratio[i] >= 1.0f + epsilon) || (advantage[i] < 0.0f && ratio[i] < 1.0f - epsilon)) { + for (int n=0; n<batch_size; n++) { + if((advantage[n] >= 0.0f && ratio[n] >= 1.0f + epsilon) || (advantage[n] < 0.0f && ratio[n] < 1.0f - epsilon)) { G_ha1[index1][index2] = G_ha1[index1][index2]; } else { - - float hx_sum_total = 0.0f; - for(int m = 0; m < num_hidden_unit; m++) { - for(int n = 0; n < num_input_RL; n++) { - hx_sum_total = hx_sum_total + ha1_temp[n][m]*arr[i][n]; + for(int k=0; k<num_hidden_unit2; k++) { + if (hxh_a_sum_array[n][k] >= 0) { + if (hx_a_sum_array[n][index2] > 0) { + d_x_d_ha1[index1][index2] = d_x_d_ha1[index1][index2] + arr[n][index1]*ha2_temp[index2][k]*ha3_temp[k][0]; + d_y_d_ha1[index1][index2] = d_y_d_ha1[index1][index2] + arr[n][index1]*ha2_temp[index2][k]*ha3_temp[k][1]; + } } } - hx_sum_total = hx_sum_total + bc1_temp[index2]; float d_mean_d_ha1 = 0.0f; float d_dev_d_ha1 = 0.0f; - if (hx_sum_total >=0) { - float hx_sum = 0.0f; - for(int j=0; j<num_input_RL; j++) { - hx_sum = hx_sum + ha1_temp[j][index2]*arr[i][j]; - } - hx_sum = hx_sum + bc1_temp[index2]; - if (hx_sum >= 0) { -// d_mean_d_ha1 = exp(mean_before_SP_array[i])/(1.0f+exp(mean_before_SP_array[i]))*ha2_temp[index2][0]*arr[i][index1]; -// d_dev_d_ha1 = exp(deviation_before_SP_array[i])/(1.0f+exp(deviation_before_SP_array[i]))*ha2_temp[index2][1]*arr[i][index1]; - d_mean_d_ha1 = ha2_temp[index2][0]*arr[i][index1]; - d_dev_d_ha1 = ha2_temp[index2][1]*arr[i][index1]; - } else { - d_mean_d_ha1 = 0.0f; - d_dev_d_ha1 = 0.0f; - } - } else { - d_mean_d_ha1 = 0.0f; - d_dev_d_ha1 = 0.0f; - } - G_ha1[index1][index2] = G_ha1[index1][index2] + advantage[i]/pi_old[i]*(d_mean_d_ha1*Grad_Normal_Dist_Mean(mean_array[i],deviation_array[i],action_array[i])+d_dev_d_ha1*Grad_Normal_Dist_Deviation(mean_array[i],deviation_array[i],action_array[i])); + d_mean_d_ha1 = exp(hxhh_a_sum_array[n][0])/(1.0f+exp(hxhh_a_sum_array[n][0]))*d_x_d_ha1[index1][index2]; + d_dev_d_ha1 = exp(hxhh_a_sum_array[n][1])/(1.0f+exp(hxhh_a_sum_array[n][1]))*d_y_d_ha1[index1][index2]; + + G_ha1[index1][index2] = G_ha1[index1][index2] + advantage[n]/pi_old[n]*(d_mean_d_ha1*Grad_Normal_Dist_Mean(mean_array[n],deviation_array[n],action_array[n])+d_dev_d_ha1*Grad_Normal_Dist_Deviation(mean_array[n],deviation_array[n],action_array[n])); } } G_ha1[index1][index2] = G_ha1[index1][index2] / batch_size; ha1_temp[index1][index2] = ha1_temp[index1][index2] - gradient_rate * G_ha1[index1][index2]; } - for (int i=0; i<batch_size; i++) { - if((advantage[i] >= 0.0f && ratio[i] >= 1.0f + epsilon) || (advantage[i] < 0.0f && ratio[i] < 1.0f - epsilon)) { + + for (int n=0; n<batch_size; n++) { + if((advantage[n] >= 0.0f && ratio[n] >= 1.0f + epsilon) || (advantage[n] < 0.0f && ratio[n] < 1.0f - epsilon)) { G_ba1[index2] = G_ba1[index2]; } else { - - float hx_sum_total = 0.0f; - for(int m = 0; m < num_hidden_unit; m++) { - for(int n = 0; n < num_input_RL; n++) { - hx_sum_total = hx_sum_total + ha1_temp[n][m]*arr[i][n]; + for(int k=0; k<num_hidden_unit2; k++) { + if (hxh_a_sum_array[n][k] >= 0) { + if (hx_a_sum_array[n][index2] > 0) { + d_x_d_ba1[index2] = d_x_d_ba1[index2] + ha2_temp[index2][k]*ha3_temp[k][0]; + d_y_d_ba1[index2] = d_y_d_ba1[index2] + ha2_temp[index2][k]*ha3_temp[k][1]; + } } } - hx_sum_total = hx_sum_total + bc1_temp[index2]; float d_mean_d_ba1 = 0.0f; float d_dev_d_ba1 = 0.0f; - if (hx_sum_total >=0) { - - float hx_sum = 0.0f; - for(int j=0; j<num_input_RL; j++) { - hx_sum = hx_sum + ha1_temp[j][index2]*arr[i][j]; - } - hx_sum = hx_sum + bc1_temp[index2]; - - if(hx_sum >=0) { -// d_mean_d_ba1 = exp(mean_before_SP_array[i])/(1.0f+exp(mean_before_SP_array[i]))*ha2_temp[index2][0]; -// d_dev_d_ba1 = exp(deviation_before_SP_array[i])/(1.0f+exp(deviation_before_SP_array[i]))*ha2_temp[index2][1]; - d_mean_d_ba1 = ha2_temp[index2][0]; - d_dev_d_ba1 = ha2_temp[index2][1]; - } else { - d_mean_d_ba1 = 0.0f; - d_dev_d_ba1 = 0.0f; - } - } else { - d_mean_d_ba1 = 0.0f; - d_dev_d_ba1 = 0.0f; - } - G_ba1[index2] = G_ba1[index2] + advantage[i]/pi_old[i]*(d_mean_d_ba1*Grad_Normal_Dist_Mean(mean_array[i],deviation_array[i],action_array[i])+d_dev_d_ba1*Grad_Normal_Dist_Deviation(mean_array[i],deviation_array[i],action_array[i])); + d_mean_d_ba1 = exp(hxhh_a_sum_array[n][0])/(1.0f+exp(hxhh_a_sum_array[n][0]))*d_x_d_ba1[index2]; + d_dev_d_ba1 = exp(hxhh_a_sum_array[n][1])/(1.0f+exp(hxhh_a_sum_array[n][1]))*d_y_d_ba1[index2]; + + G_ba1[index2] = G_ba1[index2] + advantage[n]/pi_old[n]*(d_mean_d_ba1*Grad_Normal_Dist_Mean(mean_array[n],deviation_array[n],action_array[n])+d_dev_d_ba1*Grad_Normal_Dist_Deviation(mean_array[n],deviation_array[n],action_array[n])); } } G_ba1[index2] = G_ba1[index2] / batch_size; ba1_temp[index2] = ba1_temp[index2] - gradient_rate * G_ba1[index2]; } - float G_ha2[num_hidden_unit][2] = {0.0f}; - float G_ba2[2] = {0.0f}; - for (int index2 = 0; index2 < 2; index2++) { - for (int index1 = 0; index1 < num_hidden_unit; index1++) { - for (int i=0; i<batch_size; i++) { - if((advantage[i] >= 0.0f && ratio[i] >= 1.0f + epsilon) || (advantage[i] < 0.0f && ratio[i] < 1.0f - epsilon)) { + float G_ha2[num_hidden_unit1][num_hidden_unit2] = {0.0f}; + float G_ba2[num_hidden_unit2] = {0.0f}; + float d_x_d_ha2[num_hidden_unit1][num_hidden_unit2] = {0.0f}; + float d_x_d_ba2[num_hidden_unit2] = {0.0f}; + float d_y_d_ha2[num_hidden_unit1][num_hidden_unit2] = {0.0f}; + float d_y_d_ba2[num_hidden_unit2] = {0.0f}; + + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + for (int n=0; n<batch_size; n++) { + if((advantage[n] >= 0.0f && ratio[n] >= 1.0f + epsilon) || (advantage[n] < 0.0f && ratio[n] < 1.0f - epsilon)) { G_ha2[index1][index2] = G_ha2[index1][index2]; } else { - float hx_sum_total = 0.0f; - for(int m = 0; m < num_hidden_unit; m++) { - for(int n = 0; n < num_input_RL; n++) { - hx_sum_total = hx_sum_total + ha1_temp[n][m]*arr[i][n]; + if (hxh_a_sum_array[n][index2] >= 0) { + if (hx_a_sum_array[n][index1] > 0) { + d_x_d_ha2[index1][index2] = d_x_d_ha2[index1][index2] + hx_a_sum_array[n][index1]*ha3_temp[index2][0]; + d_y_d_ha2[index1][index2] = d_y_d_ha2[index1][index2] + hx_a_sum_array[n][index1]*ha3_temp[index2][1]; } } - hx_sum_total = hx_sum_total + bc1_temp[index2]; + float d_mean_d_ha2 = 0.0f; float d_dev_d_ha2 = 0.0f; - if (hx_sum_total >=0) { - float hx_sum = 0.0f; - for(int j=0; j<num_input_RL; j++) { - hx_sum = hx_sum + ha1_temp[j][index1]*arr[i][j]; - } - hx_sum = hx_sum + bc1_temp[index1]; - if (hx_sum >= 0) { -// d_mean_d_ha2 = exp(mean_before_SP_array[i])/(1.0f+exp(mean_before_SP_array[i]))*hx_sum; -// d_dev_d_ha2 = exp(deviation_before_SP_array[i])/(1.0f+exp(deviation_before_SP_array[i]))*hx_sum; - d_mean_d_ha2 = hx_sum; - d_dev_d_ha2 = hx_sum; - } else { - d_mean_d_ha2 = 0.0f; - d_mean_d_ha2 = 0.0f; - } - } else { - d_mean_d_ha2 = 0.0f; - d_mean_d_ha2 = 0.0f; - } - G_ha2[index1][index2] = G_ha2[index1][index2] + advantage[i]/pi_old[i]*(d_mean_d_ha2*Grad_Normal_Dist_Mean(mean_array[i],deviation_array[i],action_array[i])+d_dev_d_ha2*Grad_Normal_Dist_Deviation(mean_array[i],deviation_array[i],action_array[i])); + d_mean_d_ha2 = exp(hxhh_a_sum_array[n][0])/(1.0f+exp(hxhh_a_sum_array[n][0]))*d_x_d_ha2[index1][index2]; + d_dev_d_ha2 = exp(hxhh_a_sum_array[n][1])/(1.0f+exp(hxhh_a_sum_array[n][1]))*d_y_d_ha2[index1][index2]; + + G_ha2[index1][index2] = G_ha2[index1][index2] + advantage[n]/pi_old[n]*(d_mean_d_ha2*Grad_Normal_Dist_Mean(mean_array[n],deviation_array[n],action_array[n])+d_dev_d_ha2*Grad_Normal_Dist_Deviation(mean_array[n],deviation_array[n],action_array[n])); } } G_ha2[index1][index2] = G_ha2[index1][index2] / batch_size; ha2_temp[index1][index2] = ha2_temp[index1][index2] - gradient_rate * G_ha2[index1][index2]; } - for (int i=0; i<batch_size; i++) { - if((advantage[i] >= 0.0f && ratio[i] >= 1.0f + epsilon) || (advantage[i] < 0.0f && ratio[i] < 1.0f - epsilon)) { + + for (int n=0; n<batch_size; n++) { + if((advantage[n] >= 0.0f && ratio[n] >= 1.0f + epsilon) || (advantage[n] < 0.0f && ratio[n] < 1.0f - epsilon)) { G_ba2[index2] = G_ba2[index2]; } else { - float d_mean_d_ba2 = 0.0f; - float d_dev_d_ba2 = 0.0f; -// d_mean_d_ba2 = exp(mean_before_SP_array[i])/(1.0f+exp(mean_before_SP_array[i])); -// d_dev_d_ba2 = exp(deviation_before_SP_array[i])/(1.0f+exp(deviation_before_SP_array[i])); - d_mean_d_ba2 = 1.0f; - d_dev_d_ba2 = 1.0f; - G_ba1[index2] = G_ba1[index2] + advantage[i]/pi_old[i]*(d_mean_d_ba2*Grad_Normal_Dist_Mean(mean_array[i],deviation_array[i],action_array[i])+d_dev_d_ba2*Grad_Normal_Dist_Deviation(mean_array[i],deviation_array[i],action_array[i])); + if (hxh_a_sum_array[n][index2] >= 0) { + d_x_d_ba2[index2] = d_x_d_ba2[index2] + ha3_temp[index2][0]; + d_y_d_ba2[index2] = d_y_d_ba2[index2] + ha3_temp[index2][1]; + } + float d_mean_d_ba2= 0.0f; + float d_dev_d_ba2= 0.0f; + d_mean_d_ba2 = exp(hxhh_a_sum_array[n][0])/(1.0f+exp(hxhh_a_sum_array[n][0]))*d_x_d_ba2[index2]; + d_dev_d_ba2 = exp(hxhh_a_sum_array[n][1])/(1.0f+exp(hxhh_a_sum_array[n][1]))*d_y_d_ba2[index2]; + + G_ba2[index2] = G_ba2[index2] + advantage[n]/pi_old[n]*(d_mean_d_ba2*Grad_Normal_Dist_Mean(mean_array[n],deviation_array[n],action_array[n])+d_dev_d_ba2*Grad_Normal_Dist_Deviation(mean_array[n],deviation_array[n],action_array[n])); } } G_ba2[index2] = G_ba2[index2] / batch_size; ba2_temp[index2] = ba2_temp[index2] - gradient_rate * G_ba2[index2]; } + + float G_ha3[num_hidden_unit2][2] = {0.0f}; + float G_ba3[2] = {0.0f}; + float d_x_d_ha3[num_hidden_unit2][2] = {0.0f}; + float d_x_d_ba3[2] = {0.0f}; + float d_y_d_ha3[num_hidden_unit2][2] = {0.0f}; + float d_y_d_ba3[2] = {0.0f}; + + for (int index2 = 0; index2 < 2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + for (int n=0; n<batch_size; n++) { + if((advantage[n] >= 0.0f && ratio[n] >= 1.0f + epsilon) || (advantage[n] < 0.0f && ratio[n] < 1.0f - epsilon)) { + G_ha3[index1][index2] = G_ha3[index1][index2]; + } else { + + if (hxh_a_sum_array[n][index1] >= 0) { + if (hx_a_sum_array[n][index1] > 0) { + d_x_d_ha3[index1][index2] = d_x_d_ha3[index1][index2] + hxh_a_sum_array[n][index1]; + d_y_d_ha3[index1][index2] = d_y_d_ha3[index1][index2] + hxh_a_sum_array[n][index1]; + + } + } + float d_mean_d_ha3 = 0.0f; + float d_dev_d_ha3 = 0.0f; + d_mean_d_ha3 = exp(hxhh_a_sum_array[n][0])/(1.0f+exp(hxhh_a_sum_array[n][0]))*d_x_d_ha3[index1][index2]; + d_dev_d_ha3 = exp(hxhh_a_sum_array[n][1])/(1.0f+exp(hxhh_a_sum_array[n][1]))*d_y_d_ha3[index1][index2]; + + G_ha3[index1][index2] = G_ha3[index1][index2] + advantage[n]/pi_old[n]*(d_mean_d_ha3*Grad_Normal_Dist_Mean(mean_array[n],deviation_array[n],action_array[n])+d_dev_d_ha3*Grad_Normal_Dist_Deviation(mean_array[n],deviation_array[n],action_array[n])); + } + } + G_ha3[index1][index2] = G_ha3[index1][index2] / batch_size; + ha3_temp[index1][index2] = ha3_temp[index1][index2] - gradient_rate * G_ha3[index1][index2]; + } + + for (int n=0; n<batch_size; n++) { + if((advantage[n] >= 0.0f && ratio[n] >= 1.0f + epsilon) || (advantage[n] < 0.0f && ratio[n] < 1.0f - epsilon)) { + G_ba3[index2] = G_ba3[index2]; + } else { + + d_x_d_ba3[index2] = d_x_d_ba3[index2] + 1.0f; + d_y_d_ba3[index2] = d_y_d_ba3[index2] + 1.0f; + + float d_mean_d_ba3= 0.0f; + float d_dev_d_ba3= 0.0f; + d_mean_d_ba3 = exp(hxhh_a_sum_array[n][0])/(1.0f+exp(hxhh_a_sum_array[n][0]))*d_x_d_ba3[index2]; + d_dev_d_ba3 = exp(hxhh_a_sum_array[n][1])/(1.0f+exp(hxhh_a_sum_array[n][1]))*d_y_d_ba3[index2]; + + G_ba3[index2] = G_ba3[index2] + advantage[n]/pi_old[n]*(d_mean_d_ba3*Grad_Normal_Dist_Mean(mean_array[n],deviation_array[n],action_array[n])+d_dev_d_ba3*Grad_Normal_Dist_Deviation(mean_array[n],deviation_array[n],action_array[n])); + } + } + G_ba3[index2] = G_ba3[index2] / batch_size; + ba3_temp[index2] = ba3_temp[index2] - gradient_rate * G_ba3[index2]; + } } +///////////////////////////ReLU - Bad performance////////////////////////////////// +//void update_Actor_Networks(float (*arr)[num_input_RL]) +//{ +// float gradient_rate = 0.001f; //-0.01f +// +// float G_ha1[num_input_RL][num_hidden_unit] = {0.0f}; +// float G_ba1[num_hidden_unit] = {0.0f}; +// for (int index2 = 0; index2 < num_hidden_unit; index2++) { +// for (int index1 = 0; index1 < num_input_RL; index1++) { +// for (int i=0; i<batch_size; i++) { +// if((advantage[i] >= 0.0f && ratio[i] >= 1.0f + epsilon) || (advantage[i] < 0.0f && ratio[i] < 1.0f - epsilon)) { +// G_ha1[index1][index2] = G_ha1[index1][index2]; +// } else { +// +// float hx_sum_total = 0.0f; +// for(int m = 0; m < num_hidden_unit; m++) { +// for(int n = 0; n < num_input_RL; n++) { +// hx_sum_total = hx_sum_total + ha1_temp[n][m]*arr[i][n]; +// } +// } +// hx_sum_total = hx_sum_total + bc1_temp[index2]; +// float d_mean_d_ha1 = 0.0f; +// float d_dev_d_ha1 = 0.0f; +// if (hx_sum_total >=0) { +// float hx_sum = 0.0f; +// for(int j=0; j<num_input_RL; j++) { +// hx_sum = hx_sum + ha1_temp[j][index2]*arr[i][j]; +// } +// hx_sum = hx_sum + bc1_temp[index2]; +// if (hx_sum >= 0) { +// d_mean_d_ha1 = ha2_temp[index2][0]*arr[i][index1]; +// d_dev_d_ha1 = ha2_temp[index2][1]*arr[i][index1]; +// } else { +// d_mean_d_ha1 = 0.0f; +// d_dev_d_ha1 = 0.0f; +// } +// } else { +// d_mean_d_ha1 = 0.0f; +// d_dev_d_ha1 = 0.0f; +// } +// G_ha1[index1][index2] = G_ha1[index1][index2] + advantage[i]/pi_old[i]*(d_mean_d_ha1*Grad_Normal_Dist_Mean(mean_array[i],deviation_array[i],action_array[i])+d_dev_d_ha1*Grad_Normal_Dist_Deviation(mean_array[i],deviation_array[i],action_array[i])); +// } +// } +// G_ha1[index1][index2] = G_ha1[index1][index2] / batch_size; +// ha1_temp[index1][index2] = ha1_temp[index1][index2] - gradient_rate * G_ha1[index1][index2]; +// } +// for (int i=0; i<batch_size; i++) { +// if((advantage[i] >= 0.0f && ratio[i] >= 1.0f + epsilon) || (advantage[i] < 0.0f && ratio[i] < 1.0f - epsilon)) { +// G_ba1[index2] = G_ba1[index2]; +// } else { +// +// float hx_sum_total = 0.0f; +// for(int m = 0; m < num_hidden_unit; m++) { +// for(int n = 0; n < num_input_RL; n++) { +// hx_sum_total = hx_sum_total + ha1_temp[n][m]*arr[i][n]; +// } +// } +// hx_sum_total = hx_sum_total + bc1_temp[index2]; +// float d_mean_d_ba1 = 0.0f; +// float d_dev_d_ba1 = 0.0f; +// if (hx_sum_total >=0) { +// +// float hx_sum = 0.0f; +// for(int j=0; j<num_input_RL; j++) { +// hx_sum = hx_sum + ha1_temp[j][index2]*arr[i][j]; +// } +// hx_sum = hx_sum + bc1_temp[index2]; +// +// if(hx_sum >=0) { +// d_mean_d_ba1 = ha2_temp[index2][0]; +// d_dev_d_ba1 = ha2_temp[index2][1]; +// } else { +// d_mean_d_ba1 = 0.0f; +// d_dev_d_ba1 = 0.0f; +// } +// } else { +// d_mean_d_ba1 = 0.0f; +// d_dev_d_ba1 = 0.0f; +// } +// G_ba1[index2] = G_ba1[index2] + advantage[i]/pi_old[i]*(d_mean_d_ba1*Grad_Normal_Dist_Mean(mean_array[i],deviation_array[i],action_array[i])+d_dev_d_ba1*Grad_Normal_Dist_Deviation(mean_array[i],deviation_array[i],action_array[i])); +// } +// } +// G_ba1[index2] = G_ba1[index2] / batch_size; +// ba1_temp[index2] = ba1_temp[index2] - gradient_rate * G_ba1[index2]; +// } +// +// float G_ha2[num_hidden_unit][2] = {0.0f}; +// float G_ba2[2] = {0.0f}; +// for (int index2 = 0; index2 < 2; index2++) { +// for (int index1 = 0; index1 < num_hidden_unit; index1++) { +// for (int i=0; i<batch_size; i++) { +// if((advantage[i] >= 0.0f && ratio[i] >= 1.0f + epsilon) || (advantage[i] < 0.0f && ratio[i] < 1.0f - epsilon)) { +// G_ha2[index1][index2] = G_ha2[index1][index2]; +// } else { +// +// float hx_sum_total = 0.0f; +// for(int m = 0; m < num_hidden_unit; m++) { +// for(int n = 0; n < num_input_RL; n++) { +// hx_sum_total = hx_sum_total + ha1_temp[n][m]*arr[i][n]; +// } +// } +// hx_sum_total = hx_sum_total + bc1_temp[index2]; +// float d_mean_d_ha2 = 0.0f; +// float d_dev_d_ha2 = 0.0f; +// if (hx_sum_total >=0) { +// float hx_sum = 0.0f; +// for(int j=0; j<num_input_RL; j++) { +// hx_sum = hx_sum + ha1_temp[j][index1]*arr[i][j]; +// } +// hx_sum = hx_sum + bc1_temp[index1]; +// if (hx_sum >= 0) { +// d_mean_d_ha2 = hx_sum; +// d_dev_d_ha2 = hx_sum; +// } else { +// d_mean_d_ha2 = 0.0f; +// d_mean_d_ha2 = 0.0f; +// } +// } else { +// d_mean_d_ha2 = 0.0f; +// d_mean_d_ha2 = 0.0f; +// } +// G_ha2[index1][index2] = G_ha2[index1][index2] + advantage[i]/pi_old[i]*(d_mean_d_ha2*Grad_Normal_Dist_Mean(mean_array[i],deviation_array[i],action_array[i])+d_dev_d_ha2*Grad_Normal_Dist_Deviation(mean_array[i],deviation_array[i],action_array[i])); +// } +// } +// G_ha2[index1][index2] = G_ha2[index1][index2] / batch_size; +// ha2_temp[index1][index2] = ha2_temp[index1][index2] - gradient_rate * G_ha2[index1][index2]; +// } +// for (int i=0; i<batch_size; i++) { +// if((advantage[i] >= 0.0f && ratio[i] >= 1.0f + epsilon) || (advantage[i] < 0.0f && ratio[i] < 1.0f - epsilon)) { +// G_ba2[index2] = G_ba2[index2]; +// } else { +// +// float d_mean_d_ba2 = 0.0f; +// float d_dev_d_ba2 = 0.0f; +// d_mean_d_ba2 = 1.0f; +// d_dev_d_ba2 = 1.0f; +// G_ba1[index2] = G_ba1[index2] + advantage[i]/pi_old[i]*(d_mean_d_ba2*Grad_Normal_Dist_Mean(mean_array[i],deviation_array[i],action_array[i])+d_dev_d_ba2*Grad_Normal_Dist_Deviation(mean_array[i],deviation_array[i],action_array[i])); +// } +// } +// G_ba2[index2] = G_ba2[index2] / batch_size; +// ba2_temp[index2] = ba2_temp[index2] - gradient_rate * G_ba2[index2]; +// } +//} + float rand_normal(double mean, double stddev) @@ -664,31 +931,44 @@ } } + void Overwirte_Critic_Networks() { - for (int index2 = 0; index2 < num_hidden_unit; index2++) { + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { hc1[index1][index2] = hc1_temp[index1][index2]; } bc1[index2] = bc1_temp[index2]; - hc2[index2] = hc2_temp[index2]; } - bc2 = bc2_temp; + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + hc2[index1][index2] = hc2_temp[index1][index2]; + } + bc2[index2] = bc2_temp[index2]; + hc3[index2] = hc3_temp[index2]; + } + bc3 = bc3_temp; } void Overwirte_Actor_Networks() { - for (int index2 = 0; index2 < num_hidden_unit; index2++) { + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { ha1[index1][index2] = ha1_temp[index1][index2]; } ba1[index2] = ba1_temp[index2]; } - for (int index2 = 0; index2 < 2; index2++) { - for (int index1 = 0; index1 < num_hidden_unit; index1++) { + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { ha2[index1][index2] = ha2_temp[index1][index2]; } ba2[index2] = ba2_temp[index2]; } + for (int index2 = 0; index2 < 2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + ha3[index1][index2] = ha3_temp[index1][index2]; + } + ba3[index2] = ba3_temp[index2]; + } } @@ -775,25 +1055,38 @@ ID_index_array[i] = (i+1) * 0.5f; } - for (int index2 = 0; index2 < num_hidden_unit; index2++) { + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { hc1_temp[index1][index2] = (float) (rand()%100) * 0.01f ; } bc1_temp[index2] = (float) (rand()%100) * 0.01f; - hc2_temp[index2] = (float) (rand()%100) * 0.01f; + } + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + hc2_temp[index1][index2] = (float) (rand()%100) * 0.01f; + } + bc2[index2] = (float) (rand()%100) * 0.01f; + hc3[index2] = (float) (rand()%100) * 0.01f; } - bc2_temp = (float) (rand()%100) * 0.01f; - for (int index2 = 0; index2 < num_hidden_unit; index2++) { + bc3 = (float) (rand()%100) * 0.01f; + + for (int index2 = 0; index2 < num_hidden_unit1; index2++) { for (int index1 = 0; index1 < num_input_RL; index1++) { - ha1_temp[index1][index2] = (float) (rand()%100) * 0.01f; + ha1[index1][index2] = (float) (rand()%100) * 0.01f; } - ba1_temp[index2] = (float) (rand()%100) * 0.01f; + ba1[index2] = (float) (rand()%100) * 0.01f; + } + for (int index2 = 0; index2 < num_hidden_unit2; index2++) { + for (int index1 = 0; index1 < num_hidden_unit1; index1++) { + ha2[index1][index2] = (float) (rand()%100) * 0.01f; + } + ba2[index2] = (float) (rand()%100) * 0.01f; } for (int index2 = 0; index2 < 2; index2++) { - for (int index1 = 0; index1 < num_hidden_unit; index1++) { - ha2_temp[index1][index2] = (float) (rand()%100) * 0.01f; + for (int index1 = 0; index1 < num_hidden_unit2; index1++) { + ha3[index1][index2] = (float) (rand()%100) * 0.01f; } - ba2_temp[index2] = (float) (rand()%100) * 0.01f; + ba3[index2] = (float) (rand()%100) * 0.01f; } Overwirte_Critic_Networks(); @@ -935,10 +1228,19 @@ //float temp_array[3] = {state_array[i][0], state_array[i][1], state_array[i][2]}; float temp_array[2] = {state_array[i][0], state_array[i][1]}; V[i] = Critic_Network_Temp(temp_array); + for (int i=0; i<num_hidden_unit1; i++) { + hx_c_sum_array[RL_timer][i] = hx_c_sum[i]; + } + for (int i=0; i<num_hidden_unit2; i++) { + hxh_c_sum_array[RL_timer][i] = hxh_c_sum[i]; + } + hxhh_c_sum_array[RL_timer] = hxhh_c_sum; pi[i] = exp(-(action_array[i]-mean_array[i])*(action_array[i]-mean_array[i])/(2.0f*deviation_array[i]*deviation_array[i]))/(sqrt(2.0f*PI)*deviation_array[i]); Actor_Network_Old(temp_array); pi_old[i] = exp(-(action_array[i]-mean_old)*(action_array[i]-mean_old)/(2.0f*deviation_old*deviation_old))/(sqrt(2.0f*PI)*deviation_old); - r[i] = exp(-0.00005f * state_array[i][1] * 70.0f * state_array[i][1] * 70.0f); + r[i] = exp(-0.25f * state_array[i][1] * state_array[i][1]); + if(i == batch_size-1) return_G[i] = 0.0f; + else return_G[i] = gamma * return_G[i+1] + r[i]; if(i == batch_size-1) td_target[i] = r[i]; else td_target[i] = r[i] + gamma * V[i+1]; delta[i] = td_target[i] - V[i]; @@ -2375,6 +2677,14 @@ //float temp_array[3] = {train_set_x[RL_timer], train_set_error[RL_timer], train_set_count[RL_timer]}; float temp_array[2] = {train_set_x[RL_timer], train_set_error[RL_timer]}; Actor_Network(temp_array); + for (int i=0; i<num_hidden_unit1; i++) { + hx_a_sum_array[RL_timer][i] = hx_a_sum[i]; + } + for (int i=0; i<num_hidden_unit2; i++) { + hxh_a_sum_array[RL_timer][i] = hxh_a_sum[i]; + } + hxhh_a_sum_array[RL_timer][0] = hxhh_a_sum[0]; + hxhh_a_sum_array[RL_timer][1] = hxhh_a_sum[1]; mean_array[RL_timer] = mean; deviation_array[RL_timer] = deviation; mean_before_SP_array[RL_timer] = mean_before_SP;