SHOGUN  v3.2.0
DynProg.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Gunnar Raetsch
8  * Written (W) 1999-2009 Soeren Sonnenburg
9  * Written (W) 2008-2009 Jonas Behr
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
13 #ifndef __CDYNPROG_H__
14 #define __CDYNPROG_H__
15 
17 #include <shogun/lib/common.h>
18 #include <shogun/base/SGObject.h>
19 #include <shogun/io/SGIO.h>
20 #include <shogun/lib/config.h>
23 #include <shogun/structure/Plif.h>
31 #include <shogun/lib/Time.h>
32 
33 #include <stdio.h>
34 #include <limits.h>
35 
36 namespace shogun
37 {
38  template <class T> class CSparseFeatures;
39  class CIntronList;
40  class CPlifMatrix;
41  class CSegmentLoss;
42 
43  template <class T> class CDynamicArray;
44 
45 //#define DYNPROG_TIMING
46 
// State index type: 16 bits wide when USE_BIGSTATES is defined, 8 bits otherwise.
// P_STATES is a plain pointer to T_STATES.
47 #ifdef USE_BIGSTATES
48 typedef uint16_t T_STATES ;
49 #else
50 typedef uint8_t T_STATES ;
51 #endif
52 typedef T_STATES* P_STATES ;
53 
54 #ifndef DOXYGEN_SHOULD_SKIP_THIS
55 
// Plain aggregate of segment-loss bookkeeping fields: two int32_t scalars and
// three raw pointers.
// NOTE(review): the element counts and ownership of the pointer members are
// not visible in this listing — confirm against the code that fills them.
56 struct segment_loss_struct
57 {
59  int32_t maxlookback; // presumably the maximum look-back distance — confirm
61  int32_t seqlen; // presumably the sequence length — confirm
63  int32_t *segments_changed; // presumably per-position "segment changed" markers — confirm
65  float64_t *num_segment_id; // presumably per-segment-id counts/weights — confirm
67  int32_t *length_segment_id ; // presumably per-segment-id lengths — confirm
68 };
69 #endif
70 
76 class CDynProg : public CSGObject
77 {
78 public:
83  CDynProg(int32_t p_num_svms=8);
84  virtual ~CDynProg();
85 
86  // model related functions
92  void set_num_states(int32_t N);
93 
95  int32_t get_num_states();
96 
98  int32_t get_num_svms();
99 
105  void init_content_svm_value_array(const int32_t p_num_svms);
106 
114  void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes);
115 
122  void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs);
123 
128  void resize_lin_feat(int32_t num_new_feat);
133  void set_p_vector(SGVector<float64_t> p);
134 
139  void set_q_vector(SGVector<float64_t> q);
140 
145  void set_a(SGMatrix<float64_t> a);
146 
151  void set_a_id(SGMatrix<int32_t> a);
152 
157  void set_a_trans_matrix(SGMatrix<float64_t> a_trans);
158 
163  void init_mod_words_array(SGMatrix<int32_t> p_mod_words_array);
164 
170  bool check_svm_arrays();
171 
176  void set_observation_matrix(SGNDArray<float64_t> seq);
177 
184  int32_t get_num_positions();
185 
195  void set_content_type_array(SGMatrix<float64_t> seg_path);
196 
201  void set_pos(SGVector<int32_t> pos);
202 
208  void set_orf_info(SGMatrix<int32_t> orf_info);
209 
214  void set_gene_string(SGVector<char> genestr);
215 
216 
221  void set_dict_weights(SGMatrix<float64_t> dictionary_weights);
222 
227  void best_path_set_segment_loss(SGMatrix<float64_t> segment_loss);
228 
235  void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
236 
238  void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2);
239 
244  void set_plif_matrices(CPlifMatrix* pm);
245 
246  // best_path result retrieval functions
251  SGVector<float64_t> get_scores();
252 
257  SGMatrix<int32_t> get_states();
258 
263  SGMatrix<int32_t> get_positions();
264 
265 
274  void compute_nbest_paths(int32_t max_num_signals,
275  bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
276 
278 
290  void best_path_trans_deriv(
291  int32_t* my_state_seq, int32_t *my_pos_seq,
292  int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
293 
294  // additional best_path_trans_deriv functions
299  void set_my_state_seq(int32_t* my_state_seq);
300 
305  void set_my_pos_seq(int32_t* my_pos_seq);
306 
314  void get_path_scores(float64_t** my_scores, int32_t* seq_len);
315 
323  void get_path_losses(float64_t** my_losses, int32_t* seq_len);
324 
325 
327  inline T_STATES get_N() const
328  {
329  return m_N ;
330  }
331 
336  inline void set_q(T_STATES offset, float64_t value)
337  {
338  m_end_state_distribution_q[offset]=value;
339  }
340 
345  inline void set_p(T_STATES offset, float64_t value)
346  {
347  m_initial_state_distribution_p[offset]=value;
348  }
349 
356  inline void set_a(T_STATES line_, T_STATES column, float64_t value)
357  {
358  m_transition_matrix_a.element(line_,column)=value; // look also best_path!
359  }
360 
366  inline float64_t get_q(T_STATES offset) const
367  {
368  return m_end_state_distribution_q[offset];
369  }
370 
376  inline float64_t get_q_deriv(T_STATES offset) const
377  {
378  return m_end_state_distribution_q_deriv[offset];
379  }
380 
386  inline float64_t get_p(T_STATES offset) const
387  {
388  return m_initial_state_distribution_p[offset];
389  }
390 
396  inline float64_t get_p_deriv(T_STATES offset) const
397  {
398  return m_initial_state_distribution_p_deriv[offset];
399  }
400 
404  void precompute_content_values();
405 
412  inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2)
413  {
414  m_lin_feat.get_array_size(dim1, dim2);
415  return m_lin_feat.get_array();
416  }
425  inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len)
426  {
427  m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true);
428  }
433  void create_word_string();
434 
437  void precompute_stop_codons();
438 
445  inline float64_t get_a(T_STATES line_, T_STATES column) const
446  {
447  return m_transition_matrix_a.element(line_, column); // look also best_path()!
448  }
449 
456  inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
457  {
458  return m_transition_matrix_a_deriv.element(line_, column); // look also best_path()!
459  }
461 
466  void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
467 
470  {
471  return m_seg_loss_obj;
472  }
473 
480  void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
481  {
482  m_long_transitions = use_long_transitions;
483  m_long_transition_threshold = threshold;
484  SG_DEBUG("ignoring max_len\n")
485  //m_long_transition_max = max_len;
486  }
487 
488 protected:
489 
490  /* helper functions */
491 
501  void lookup_content_svm_values(const int32_t from_state,
502  const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
503  float64_t* svm_values, int32_t frame);
504 
512  inline void lookup_tiling_plif_values(const int32_t from_state,
513  const int32_t to_state, const int32_t len, float64_t* svm_values);
514 
519  inline int32_t find_frame(const int32_t from_state);
520 
529  inline int32_t raw_intensities_interval_query(
530  const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
531 
532 #ifndef DOXYGEN_SHOULD_SKIP_THIS
533 
534  struct svm_values_struct
535  {
537  int32_t maxlookback;
539  int32_t seqlen;
540 
542  int32_t* start_pos;
544  float64_t ** svm_values_unnormalized;
546  float64_t * svm_values;
548  bool *** word_used;
550  int32_t **num_unique_words;
551  };
552 #endif // DOXYGEN_SHOULD_SKIP_THIS
553 
562  bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
563 
565  virtual const char* get_name() const { return "DynProg"; }
566 
567 private:
568 
569  T_STATES trans_list_len;
570  T_STATES **trans_list_forward;
571  T_STATES *trans_list_forward_cnt;
572  float64_t **trans_list_forward_val;
573  int32_t **trans_list_forward_id;
574  bool mem_initialized;
575 
576 #ifdef DYNPROG_TIMING
577  CTime MyTime;
578  CTime MyTime2;
579  CTime MyTime3;
580 
581  float64_t segment_init_time;
582  float64_t segment_pos_time;
583  float64_t segment_clean_time;
584  float64_t segment_extend_time;
585  float64_t orf_time;
586  float64_t content_time;
587  float64_t content_penalty_time;
588  float64_t content_svm_values_time ;
589  float64_t content_plifs_time ;
590  float64_t svm_init_time;
591  float64_t svm_pos_time;
592  float64_t inner_loop_time;
593  float64_t inner_loop_max_time ;
594  float64_t svm_clean_time;
595  float64_t long_transition_time ;
596 #endif
597 
598 
599 protected:
604  int32_t m_N;
606 
611 
615 
619 
621 
623  int32_t m_num_degrees;
625  int32_t m_num_svms;
626 
649 
651 // CDynamicArray<int32_t> m_svm_pos_start;
657  int32_t m_max_a_id;
658 
659  // input arguments
665  int32_t m_seq_len;
688  uint16_t*** m_wordstr;
705 
709 
710  // output arguments
717 
724 
729 
733 
736 
742 
746  int32_t* m_probe_pos;
752  int32_t m_num_raw_data;
753 
763  //int32_t m_long_transition_max ;
764 
768  static int32_t word_degree_default[4];
769 
773  static int32_t cum_num_words_default[5];
774 
777  static int32_t frame_plifs[3];
778 
781  static int32_t num_words_default[4];
782 
784  static int32_t mod_words_default[32];
785 
787  static bool sign_words_default[16];
788 
790  static int32_t string_words_default[16];
791 };
792 }
793 #endif
Class Time that implements a stopwatch based on either cpu time or wall clock time.
Definition: Time.h:46
CDynamicArray< float64_t > m_end_state_distribution_q_deriv
Definition: DynProg.h:618
bool m_svm_arrays_clean
Definition: DynProg.h:655
CDynamicArray< float64_t > m_segment_loss
Definition: DynProg.h:692
CPlifMatrix * m_plif_matrices
Definition: DynProg.h:723
float64_t get_p(T_STATES offset) const
Definition: DynProg.h:386
CDynamicArray< int32_t > m_positions
Definition: DynProg.h:716
T_STATES * P_STATES
Definition: HMM.h:66
float64_t get_p_deriv(T_STATES offset) const
Definition: DynProg.h:396
uint16_t *** m_wordstr
Definition: DynProg.h:688
CDynamicArray< float64_t > m_dict_weights
Definition: DynProg.h:690
void set_lin_feat(float64_t *p_lin_feat, int32_t p_num_svms, int32_t p_seq_len)
Definition: DynProg.h:425
CDynamicArray< int32_t > m_segment_ids
Definition: DynProg.h:694
float64_t get_q_deriv(T_STATES offset) const
Definition: DynProg.h:376
bool m_long_transitions
Definition: DynProg.h:755
int32_t m_max_a_id
Definition: DynProg.h:657
CDynamicArray< int32_t > m_transition_matrix_a_id
transition matrix
Definition: DynProg.h:608
CDynamicArray< int32_t > m_word_degree
Definition: DynProg.h:628
int32_t * m_cum_num_words_array
Definition: DynProg.h:632
virtual const char * get_name() const
Definition: DynProg.h:565
int32_t * m_mod_words_array
Definition: DynProg.h:640
CDynamicArray< float64_t > m_lin_feat
Definition: DynProg.h:741
class SegmentLoss
Definition: SegmentLoss.h:22
int32_t m_num_intron_plifs
Definition: DynProg.h:735
int32_t m_seq_len
Definition: DynProg.h:665
CDynamicArray< float64_t > m_initial_state_distribution_p_deriv
Definition: DynProg.h:614
int32_t m_num_degrees
Definition: DynProg.h:623
CSparseFeatures< float64_t > * m_seq_sparse1
Definition: DynProg.h:719
CSparseFeatures< float64_t > * m_seq_sparse2
Definition: DynProg.h:721
float64_t get_a_deriv(T_STATES line_, T_STATES column) const
Definition: DynProg.h:456
class Plif
Definition: Plif.h:38
int32_t * m_num_probes_cum
Definition: DynProg.h:748
float64_t get_a(T_STATES line_, T_STATES column) const
Definition: DynProg.h:445
int32_t * m_num_lin_feat_plifs_cum
Definition: DynProg.h:750
CDynamicArray< float64_t > m_initial_state_distribution_p
initial distribution of states
Definition: DynProg.h:613
CDynamicArray< int32_t > m_mod_words
Definition: DynProg.h:638
class IntronList
Definition: IntronList.h:20
CDynamicArray< float64_t > m_transition_matrix_a_deriv
Definition: DynProg.h:610
void set_p(T_STATES offset, float64_t value)
Definition: DynProg.h:345
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:102
CDynamicArray< int32_t > m_num_unique_words
Definition: DynProg.h:653
CDynamicArray< float64_t > m_transition_matrix_a
Definition: DynProg.h:609
CDynamicArray< bool > m_genestr_stop
Definition: DynProg.h:728
CDynamicArray< int32_t > m_orf_info
Definition: DynProg.h:667
double float64_t
Definition: common.h:48
CDynamicArray< float64_t > m_segment_sum_weights
Definition: DynProg.h:669
int32_t * m_string_words_array
Definition: DynProg.h:648
CDynamicArray< int32_t > m_my_pos_seq
Definition: DynProg.h:700
CDynamicArray< int32_t > m_states
Definition: DynProg.h:714
CDynamicArray< int32_t > m_cum_num_words
Definition: DynProg.h:630
CDynamicArray< int32_t > m_string_words
Definition: DynProg.h:646
CDynamicArray< int32_t > m_pos
Definition: DynProg.h:663
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
int32_t m_long_transition_threshold
Definition: DynProg.h:758
CDynamicArray< float64_t > m_end_state_distribution_q
distribution of end-states
Definition: DynProg.h:617
float64_t * get_lin_feat(int32_t &dim1, int32_t &dim2)
Definition: DynProg.h:412
void set_a(T_STATES line_, T_STATES column, float64_t value)
Definition: DynProg.h:356
float64_t get_q(T_STATES offset) const
Definition: DynProg.h:366
float64_t * m_raw_intensities
Definition: DynProg.h:744
void set_q(T_STATES offset, float64_t value)
Definition: DynProg.h:336
CDynamicArray< float64_t > m_my_losses
Definition: DynProg.h:704
CDynamicArray< float64_t > m_segment_mask
Definition: DynProg.h:696
CDynamicArray< char > m_genestr
Definition: DynProg.h:673
CDynamicArray< float64_t > m_scores
Definition: DynProg.h:712
uint8_t T_STATES
Definition: HMM.h:64
#define SG_DEBUG(...)
Definition: SGIO.h:109
all classes and functions are contained in the shogun namespace
Definition: class_list.h:16
CDynamicArray< float64_t > m_observation_matrix
Definition: DynProg.h:661
CDynamicArray< bool > m_sign_words
Definition: DynProg.h:642
CDynamicArray< int32_t > m_num_words
Definition: DynProg.h:634
CDynamicObjectArray m_plif_list
Definition: DynProg.h:671
CDynamicArray< float64_t > m_my_scores
Definition: DynProg.h:702
int32_t * m_probe_pos
Definition: DynProg.h:746
Dynamic Programming Class.
Definition: DynProg.h:76
int32_t * m_num_words_array
Definition: DynProg.h:636
CIntronList * m_intron_list
Definition: DynProg.h:732
int32_t m_num_raw_data
Definition: DynProg.h:752
T_STATES get_N() const
access function for number of states N
Definition: DynProg.h:327
int32_t m_num_svms
Definition: DynProg.h:625
void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
Definition: DynProg.h:480
CSegmentLoss * m_seg_loss_obj
Definition: DynProg.h:708
CSegmentLoss * get_segment_loss_object()
Definition: DynProg.h:469
CDynamicArray< int32_t > m_my_state_seq
Definition: DynProg.h:698
store plif arrays for all transitions in the model
Definition: PlifMatrix.h:29
bool * m_sign_words_array
Definition: DynProg.h:644

SHOGUN Machine Learning Toolbox - Documentation