00001 #ifndef _DIVINE_NETWORK_HH_
00002 #define _DIVINE_NETWORK_HH_
00003
00006 #ifndef DOXYGEN_PROCESSING
00007
00008 #undef SEEK_SET
00009 #undef SEEK_END
00010 #undef SEEK_CUR
00011
00012 #define OMPI_IGNORE_CXX_SEEK
00013
00014 #include <vector>
00015 #include <queue>
00016 #include <memory>
00017 #include <cmath>
00018 #include <mpi.h>
00019 #include "common/error.hh"
00020 #include "common/sysinfo.hh"
00021 #include "system/state.hh"
00022 #include "distributed/message.hh"
00023
00024 using namespace std;
00025
00026 namespace divine {
00027 #endif // DOXYGEN_PROCESSING
// Message tag constants. Judging by the names they separate ordinary from
// urgent traffic; how they are consumed is not visible in this header.
00032 const int NET_TAG_NORMAL = 0;
00037 const int NET_TAG_URGENT = 1;
00038
// Error-type identifier; every entry of net_err_msgs is built with it.
00040 const int NETWORK_ERR_TYPE = 1729;
// Network error codes. The values are consecutive (0..16) and each one equals
// the position of its entry in net_err_msgs, so a new code needs a new table
// entry at the matching position.
00042 const int NET_NO_ERROR = 0;
00044 const int NET_ERR_ALREADY_INITIALIZED = 1;
00046 const int NET_ERR_NOT_INITIALIZED = 2;
00048 const int NET_ERR_INITIALIZATION_FAILED = 3;
00050 const int NET_ERR_FINALIZATION_FAILED = 4;
00052 const int NET_ERR_INVALID_MSG_SIZE = 5;
00054 const int NET_ERR_INVALID_DESTINATION = 6;
00056 const int NET_ERR_INVALID_SOURCE = 7;
00058 const int NET_ERR_SEND_MSG_FAILED = 8;
00060 const int NET_ERR_MSG_PROBE_FAILED = 9;
00062 const int NET_ERR_RECEIVE_MSG_FAILED = 10;
00064 const int NET_ERR_GET_MSG_SIZE_FAILED = 11;
00066 const int NET_ERR_BARRIER_FAILED = 12;
00068 const int NET_ERR_ABORT_FAILED = 13;
00070 const int NET_ERR_GATHER_FAILED = 14;
00072 const int NET_ERR_ALLGATHER_FAILED = 15;
00074 const int NET_ERR_INVALID_WORKSTATION_NUMBER = 16;
// Table of network error descriptions. Each entry is an ERR_triplet_t built
// from (message text, NETWORK_ERR_TYPE, error code). Entries appear in
// ascending code order, so net_err_msgs[NET_ERR_x] is the entry for code
// NET_ERR_x. The explicit size 17 must equal the number of error codes
// (NET_NO_ERROR .. NET_ERR_INVALID_WORKSTATION_NUMBER).
00076 const ERR_triplet_t net_err_msgs[17] = {
00077 ERR_triplet_t("No error.",
00078 NETWORK_ERR_TYPE,
00079 NET_NO_ERROR),
00080 ERR_triplet_t("Error: Network already initialized.",
00081 NETWORK_ERR_TYPE,
00082 NET_ERR_ALREADY_INITIALIZED),
00083 ERR_triplet_t("Error: Network not initialized.",
00084 NETWORK_ERR_TYPE,
00085 NET_ERR_NOT_INITIALIZED),
00086 ERR_triplet_t("Error: Network initialization failed.",
00087 NETWORK_ERR_TYPE,
00088 NET_ERR_INITIALIZATION_FAILED),
00089 ERR_triplet_t("Error: Network finalization failed.",
00090 NETWORK_ERR_TYPE,
00091 NET_ERR_FINALIZATION_FAILED),
00092 ERR_triplet_t("Error: Invalid message size.",
00093 NETWORK_ERR_TYPE,
00094 NET_ERR_INVALID_MSG_SIZE),
00095 ERR_triplet_t("Error: Invalid destination.",
00096 NETWORK_ERR_TYPE,
00097 NET_ERR_INVALID_DESTINATION),
00098 ERR_triplet_t("Error: Invalid source.",
00099 NETWORK_ERR_TYPE,
00100 NET_ERR_INVALID_SOURCE),
00101 ERR_triplet_t("Error: Send message failed.",
00102 NETWORK_ERR_TYPE,
00103 NET_ERR_SEND_MSG_FAILED),
00104 ERR_triplet_t("Error: Message probe failed.",
00105 NETWORK_ERR_TYPE,
00106 NET_ERR_MSG_PROBE_FAILED),
00107 ERR_triplet_t("Error: Receive message failed.",
00108 NETWORK_ERR_TYPE,
00109 NET_ERR_RECEIVE_MSG_FAILED),
00110 ERR_triplet_t("Error: Get msg size failed.",
00111 NETWORK_ERR_TYPE,
00112 NET_ERR_GET_MSG_SIZE_FAILED),
00113 ERR_triplet_t("Error: Barrier failed.",
00114 NETWORK_ERR_TYPE,
00115 NET_ERR_BARRIER_FAILED),
00116 ERR_triplet_t("Error: Abort failed.",
00117 NETWORK_ERR_TYPE,
00118 NET_ERR_ABORT_FAILED),
00119 ERR_triplet_t("Error: Gather failed.",
00120 NETWORK_ERR_TYPE,
00121 NET_ERR_GATHER_FAILED),
00122 ERR_triplet_t("Error: AllGather failed.",
00123 NETWORK_ERR_TYPE,
00124 NET_ERR_ALLGATHER_FAILED),
00125 ERR_triplet_t("Error: Invalid workstation number.",
00126 NETWORK_ERR_TYPE,
00127 NET_ERR_INVALID_WORKSTATION_NUMBER)
00128 };
00129
// Forward declaration so the smart-pointer alias below can name the class
// before its definition.
00130 class comm_matrix_t;
00131
00133
// Owning pointer to a comm_matrix_t; used as parameter and return type of the
// matrix operators and of network_t::get_comm_matrix_*.
// NOTE(review): std::auto_ptr is deprecated since C++11 and removed in C++17,
// so this header does not build in C++17 mode against a conforming library.
// Copying an auto_ptr transfers ownership away from the source. Switching to
// another pointer type changes this public alias and needs a coordinated
// change in all users.
00137 typedef auto_ptr<comm_matrix_t> pcomm_matrix_t;
00138
00140
// Integer matrix addressed by (row, col). Storage is reached through `elems`,
// the dimensions are kept in rowcount/colcount. Only the two size getters are
// defined in this header; the constructor, destructor, element access and
// arithmetic operators are implemented elsewhere.
// NOTE(review): a destructor is declared but no copy constructor; an implicit
// copy would duplicate the `elems` pointer — presumably unsafe if the
// destructor frees it. Confirm against the implementation.
00145 class comm_matrix_t {
00146 private:
00147 int **elems;
00148 int rowcount, colcount;
00149 protected:
// Error vector handed to the constructor.
00150 error_vector_t& errvec;
00151 public:
00153
// rows, col: requested dimensions; arg0: error vector to use, gerr by default.
00160 comm_matrix_t(int rows, int col, error_vector_t& arg0 = gerr);
00162 ~comm_matrix_t();
00164
// Returns the stored row count. const-qualified: it only reads a member, and
// this lets the getter be called on a const matrix.
00167 int getrowcount(void) const { return rowcount; }
00169
// Returns the stored column count. const-qualified for the same reason.
00172 int getcolcount(void) const { return colcount; }
00174
// Element access by (row, col); returns a modifiable reference.
// Range-checking behavior is not visible in this header.
00182 int& operator()(int row, int col);
00184
// Arithmetic on matrices held by pcomm_matrix_t. All operators take their
// operands by const reference and return a new pcomm_matrix_t.
// Unary minus.
00188 friend pcomm_matrix_t operator-(const pcomm_matrix_t& m);
00190
// Binary minus.
00195 friend pcomm_matrix_t operator-(const pcomm_matrix_t& m1, const pcomm_matrix_t& m2);
00197
// Binary plus.
00202 friend pcomm_matrix_t operator+(const pcomm_matrix_t& m1, const pcomm_matrix_t& m2);
00204
// Multiplication by an integer, in both operand orders.
00211 friend pcomm_matrix_t operator*(int a, const pcomm_matrix_t& m);
00213 friend pcomm_matrix_t operator*(const pcomm_matrix_t& m, int a);
00214 };
00215
00217
// Network layer class. This header declares its configuration switches, the
// statistics helpers (defined inline), its private buffer bookkeeping and the
// public messaging API. All member functions other than the stats_* helpers
// are declared only; their definitions are outside this header.
00220 class network_t {
00221 public:
00222
00223
00224
00225
// Compile-time feature switches. Preprocessor macros are not scoped by the
// class: they stay defined for every file that includes this header.
// OPT_STATS enables the `statistics` helpers; OPT_STATS_MPI_RATE adds the
// rate-tracking members further below and relies on OPT_STATS.
00226 #define OPT_STATS
00227 #define OPT_STATS_MPI_RATE
00228
00229
00230
00231
00232
// Switches disabled by default (inside `#if 0`).
00233 #if 0
00234 #define OPT_STATS_MPI_RATE_PRINT
00235 #define OPT_STATS_MPI_PENDING
00236 #define OPT_STATS_TAGS
00237 #define OPT_GRID_CONFIG
00238 #endif
00239 #if 0
00240 #define OPT_STATS_MPI_CALLS
00241 #endif
00242
00243 #if defined(OPT_STATS_TAGS)
// TAGS_MASK is only a valid bit mask because TAGS_MAX is a power of two.
00244 #define TAGS_MAX 256
00245 #define TAGS_MASK (TAGS_MAX - 1)
00246 int tag_count[TAGS_MAX];
00247 #endif
00248
00249 #if defined(OPT_GRID_CONFIG)
00250
00251 char *grid_config;
00252 #endif
00253
00254 #if defined(OPT_STATS)
// Running accumulator: sample count, sum, sum of squares and the extremes.
00255 struct statistics {
00256 unsigned long long num;
00257 double sum;
00258 double sum_sq;
00259 double min;
00260 double max;
00261 };
00262
// Resets an accumulator to the empty state.
00263 void stats_init(statistics *stats) {
00264 stats->num = 0;
00265 stats->sum = 0.0;
00266 stats->sum_sq = 0.0;
00267 stats->min = 9e99;
00268 stats->max = 0.0;
00269 }
00270
// Adds one sample. The first sample after stats_init always becomes both the
// minimum and the maximum, so the extremes no longer depend on the seed values
// written by stats_init. Previously an all-negative sample set reported
// max == 0.0, and values above 9e99 never registered as the minimum.
00271 void stats_update(statistics *stats, double val) {
00272 stats->num++;
00273 stats->sum += val;
00274 stats->sum_sq += val * val;
// num was just incremented, so num == 1 identifies the first sample.
00275 if (stats->num == 1 || val > stats->max)
00276 {
00277 stats->max = val;
00278 }
00279 if (stats->num == 1 || val < stats->min)
00280 {
00281 stats->min = val;
00282 }
00283 }
00284
// Number of samples.
// NOTE(review): the counter is unsigned long long but the return type is
// unsigned, so the value is truncated where unsigned is narrower. The
// signature is left unchanged for existing callers.
00285 unsigned stats_num(statistics *stats) {
00286 return stats->num;
00287 }
00288
// Smallest sample, or 0.0 when no sample has been recorded.
00289 double stats_min(statistics *stats) {
00290 if (stats->num > 0)
00291 {
00292 return stats->min;
00293 }
00294
00295 return 0.0;
00296 }
00297
// Largest sample; after stats_init with no samples this is the seed 0.0.
00298 double stats_max(statistics *stats) {
00299 return stats->max;
00300 }
00301
// Arithmetic mean, or 0.0 when no sample has been recorded.
00302 double stats_mean(statistics *stats) {
00303 if (stats->num > 0)
00304 {
00305 return stats->sum / stats->num;
00306 }
00307
00308 return 0.0;
00309 }
00310
// Sample variance (divisor num - 1) computed from the running sums.
// Returns 0.0 for fewer than two samples, and also when the computed value is
// negative, which floating-point cancellation in this formula can produce.
00311 double stats_var(statistics *stats) {
00312 if (stats->num > 1)
00313 {
00314 double var;
00315
00316 var = (stats->sum_sq - (stats->sum * stats->sum) / stats->num) /
00317 (stats->num - 1);
00318
00319
00320
00321 if (var >= 0.0)
00322 {
00323 return var;
00324 }
00325 }
00326
00327 return 0.0;
00328 }
00329
// Square root of stats_var (standard deviation); 0.0 when the variance is 0.
00330 double stats_sqrt_var(statistics *stats) {
00331 double var = stats_var(stats);
00332
00333 if (var > 0.0)
00334 {
00335 return sqrt(var);
00336 }
00337
00338 return 0.0;
00339 }
00340
00341 #endif
00342
00343 #if defined(OPT_STATS_MPI_RATE)
// Rate statistics state. STATS_INTERVAL is presumably the slice length in
// seconds — confirm against the implementation.
00344 #define STATS_INTERVAL 3.0
00345 double stats_time_begin, stats_slice_begin, stats_slice_end;
00346
00347 statistics stats_Sent_bytes_local;
00348 statistics stats_Recv_bytes;
00349 #if defined(OPT_GRID_CONFIG)
00350 statistics stats_Sent_bytes_remote;
00351 #endif
00352 #endif
00353
00354 #if defined(OPT_STATS_MPI_RATE_PRINT)
00355 vminfo_t vm;
00356 #endif
00357
00358 #if defined(OPT_STATS_MPI_CALLS)
// One accumulator per instrumented call; names follow the MPI routine names.
00359 statistics stats_Issend, stats_Issend_urg, stats_Test,
00360 stats_Isend, stats_Isend_urg,
00361 stats_Probe, stats_Iprobe0, stats_Iprobe1,
00362 stats_Recv, stats_Barrier, stats_Gather, stats_Allgather;
00363 #endif
00364
00365 private:
00366
00367
00368
00369
00370
// Buffer bookkeeping records.
// net_send_buffer_t: one send buffer, linked to the next one through `next`,
// with its own MPI_Request and a `pending` flag.
// net_send_buffers_t: list head/current/tail pointers plus counters.
// net_recv_buffer_t / net_urgent_recv_buffer_t: receive-side storage and
// message counters.
00371 struct net_send_buffer_t { char* ptr; int size; int msgs_cnt; timeinfo_t flush_timeout;
00372 bool pending; MPI_Request req; net_send_buffer_t *next; };
00373 struct net_send_buffers_t { net_send_buffer_t *first; net_send_buffer_t *current;
00374 net_send_buffer_t *last; int total_allocated_size; int total_pending_size;
00375 int all_msgs_cnt; int all_urgent_msgs_cnt; int flushes_cnt; };
00376 struct net_recv_buffer_t { char* ptr; int position; int size; int all_msgs_cnt; };
00377 struct net_urgent_recv_buffer_t { char* ptr; int size; int all_urgent_msgs_cnt; };
00378
// Selector passed to get_comm_matrix(); the four values correspond to the
// public get_comm_matrix_snm/rnm/sum/rum wrappers declared below.
00379 enum net_comm_matrix_type_t { ncmt_snm, ncmt_rnm, ncmt_sum, ncmt_rum };
00380
// State flags and the most recent return codes.
00381 bool fnetwork_initialized;
00382 bool finitialized;
00383 int flast_mpi_rc;
00384 int flast_net_rc;
00385 bool ftimed_flushing_enabled;
00386
// Buffer containers and scratch buffers, held through raw pointers.
// Allocation and release are not visible in this header.
00387 vector<net_send_buffers_t> *send_buffer;
00388 vector<net_recv_buffer_t> *recv_buffer;
00389 vector<net_urgent_recv_buffer_t> *urgent_recv_buffer;
00390 char *tmp_rbuf;
00391 char *tmp_sbuf;
00392 int* tmp_buf_cl_size;
00393
00394
00395 queue<int> recv_buffers_queue;
00396
00397 queue<int> urgent_recv_buffers_queue;
00398
00399
// Buffer limits, exposed through get_buf_*_limit / set_buf_*_limit.
00400 int fbuf_msgs_cnt_limit;
00401 int fbuf_size_limit;
00402 int fbuf_time_limit_sec;
00403 int fbuf_time_limit_msec;
00404
00405
// Traffic counters.
00406 int fstat_sent_messages_cnt;
00407 int fstat_sent_messages_size;
00408 int fstat_sent_urgent_messages_cnt;
00409 int fstat_sent_urgent_messages_size;
00410
00411 int fstat_received_messages_cnt;
00412 int fstat_received_messages_size;
00413 int fstat_received_urgent_messages_cnt;
00414 int fstat_received_urgent_messages_size;
00415
00416 int fstat_all_barriers_cnt;
00417
// Identity of this process, exposed through get_id / get_cluster_size /
// get_processor_name. The name buffer has room for MPI_MAX_PROCESSOR_NAME
// characters plus one extra byte.
00418 int fid;
00419 int fcluster_size;
00420 char fprocessor_name[MPI_MAX_PROCESSOR_NAME + 1];
00421
00422 int size_of_int;
00423
// Internal workers behind the public API. The extra bool parameters select
// the variant (urgent, from_source, blocking, non_exc, called_internally).
00424 bool get_comm_matrix(net_comm_matrix_type_t cm_type, pcomm_matrix_t& ret, int target);
00425 bool send_message_ex(char* buf, int size, int dest, int tag, bool urgent);
00426 bool message_probe(int src, int tag, bool &flag, int &size,
00427 MPI_Status& status, bool blocking);
00428 bool is_new_message_ex(int& size, int& src, int& tag, bool& flag,
00429 bool urgent, bool from_source, bool blocking);
00430 bool receive_message_from_network(char *buf, int size, int src, int tag,
00431 MPI_Status& status);
00432 bool receive_message_ex(char *&buf, int &size, int& src, int& tag,
00433 bool non_exc, bool from_source, bool called_internally);
00434 bool receive_urgent_message_ex(char *&buf, int &size, int& src, int& tag,
00435 bool non_exc, bool from_source, bool called_internally);
00436 public:
// arg0: error vector to use, gerr by default.
00439 network_t(error_vector_t& arg0 = gerr);
00441 ~network_t();
00442
00445
// Set-up and tear-down. argc/argv are taken by reference.
00456 bool initialize_network(int &argc, char **&argv);
00458
00462 bool initialize_buffers();
00464 bool finalize();
00466
00473
00474
// Operations named after their MPI counterparts (barrier, gather, allgather,
// abort).
00478 bool barrier(void);
00480
00498 bool gather(char *sbuf, int ssize, char *rbuf, int rsize, int root);
00500
00502 bool all_gather(char *sbuf, int ssize, char *rbuf, int rsize);
00504
00506 bool abort(void);
00508
// Sending. Each of send_message / send_urgent_message has a raw-buffer
// overload (buf, size) and a message_t overload.
00525 bool send_message(char* buf, int size, int dest, int tag);
00526
00528
00544 bool send_message(const message_t & message, int dest, int tag);
00545
00547
00550 bool send_urgent_message(char* buf, int size, int dest, int tag);
00551
00553
00556 bool send_urgent_message(const message_t & message, int dest, int tag);
00558
// Flushing of send buffers: one destination, all, or timed-out ones only.
00563 bool flush_buffer(int dest);
00564
00566
00570 bool flush_all_buffers();
// flush_some_buffers is declared unless ORIG_FLUSH is defined by the build.
00571 #if !defined(ORIG_FLUSH)
00573
00578 bool flush_some_buffers();
00579 #endif
00581
00585 bool flush_all_buffers_timed_out_only();
00587
// Message queries. size/tag/flag are outputs; src is an output in the plain
// variants and an input (by value) in the *_from_source variants.
00600 bool is_new_message(int& size, int& src, int& tag, bool& flag);
00602
00604 bool is_new_message_from_source(int& size, int src, int& tag, bool& flag);
00606
00608 bool is_new_urgent_message(int& size, int& src, int& tag, bool& flag);
00610
00613 bool is_new_urgent_message_from_source(int& size, int src, int& tag, bool& flag);
00615
// Receiving. The *_non_exc variants take the buffer pointer by reference
// (char *&buf); the others take it by value.
00629 bool receive_message(char *buf, int& size, int& src, int& tag);
00630
00632
00645 bool receive_message(message_t & message, int& src, int& tag);
00647
00652 bool receive_message_non_exc(char *&buf, int& size, int& src, int& tag);
00654
00656 bool receive_message_from_source(char *buf, int& size, int src, int& tag);
00658
00660 bool receive_urgent_message(char *buf, int& size, int& src, int& tag);
00662
00665 bool receive_urgent_message_non_exc(char *&buf, int& size, int& src, int& tag);
00667
00670 bool receive_urgent_message_from_source(char *buf, int& size, int src, int& tag);
00671
00672
00673
00674
00676
// Communication matrices, one getter per net_comm_matrix_type_t value.
// The result is returned through `ret`.
00688 bool get_comm_matrix_snm(pcomm_matrix_t& ret, int target);
00689
00691
00695 bool get_comm_matrix_rnm(pcomm_matrix_t& ret, int target);
00696
00698
00701 bool get_comm_matrix_sum(pcomm_matrix_t& ret, int target);
00702
00704
00708 bool get_comm_matrix_rum(pcomm_matrix_t& ret, int target);
00709
00711
// Buffer limit getters (results through reference parameters) and setters.
00723 bool get_buf_msgs_cnt_limit(int& limit);
00725
00738 bool get_buf_size_limit(int& limit);
00740
00760 bool get_buf_time_limit(int& limit_sec, int& limit_msec);
00761
00763 bool set_buf_msgs_cnt_limit(int limit);
00765 bool set_buf_size_limit(int limit);
00767 bool set_buf_time_limit(int limit_sec, int limit_msec);
00768
00770
// Counter queries; results are written to the reference parameter.
00773 bool get_all_sent_msgs_cnt(int& cnt);
00775
00778 bool get_all_received_msgs_cnt(int& cnt);
00779
00780
00782
// Per-peer counters come in two forms: bool result with an output parameter,
// and a form that returns the count directly.
00786 bool get_sent_msgs_cnt_sent_to(int to,int& cnt);
00788
00792 int get_sent_msgs_cnt_sent_to(int to);
00793
00795
00799 bool get_recv_msgs_cnt_recv_from(int from,int& cnt);
00801
00804 int get_recv_msgs_cnt_recv_from(int from);
00805
00806
00808
00811 bool get_user_sent_msgs_cnt(int& cnt);
00813
00816 bool get_user_received_msgs_cnt(int& cnt);
00817
00819
00822 bool get_all_barriers_cnt(int& cnt);
00823
00825
00829 bool get_buffer_flushes_cnt(int dest, int& cnt);
00831
00834 bool get_all_buffers_flushes_cnt(int& cnt);
00836
// Pending-size queries; the *_stats forms exist only with OPT_STATS_MPI_PENDING.
00842 bool get_total_pending_size(int dest, int& size, bool test);
00843 #if defined(OPT_STATS_MPI_PENDING)
00845 bool get_total_pending_stats(int dest, int& size, int& curpkts, int& maxpkts, bool test);
00846 #endif
00848
00853 bool get_all_total_pending_size(int& size, bool test);
00854 #if defined(OPT_STATS_MPI_PENDING)
00856 bool get_all_total_pending_stats(int& size, int& curpkts, int& maxpkts, bool test);
00857 #endif
00859
// Identity queries; results are written to the reference/pointer parameters.
00868 bool get_id(int &id);
00870
00877 bool get_cluster_size(int &size);
00879
00886 bool get_processor_name(char *proc_name, int& length);
00887
00888 protected:
// Error vector handed to the constructor.
00889 error_vector_t& errvec;
00890
00891 };
00892
00893 #ifndef DOXYGEN_PROCESSING
00894 }
00895 #endif // DOXYGEN_PROCESSING
00896
00897 #endif
00898
00899
00900
00901
00902
00903
00904
00905
00906
00907
00908
00909
00910
00911
00912
00913
00914
00915
00916
00917
00918
00919
00920
00921
00922
00923