Compare commits

...

2881 Commits

Author SHA1 Message Date
0b92e5c9ed fix static linkage and make THD statically linked 2017-08-28 10:41:55 -04:00
df44c571c6 increase test subprocess timeout 2017-08-25 09:00:36 -07:00
ed03f74043 fix leaking symbols in THNN 2017-08-25 09:00:35 -07:00
c8d8803b90 Remove unnecessary moves in convolution autograd. 2017-08-25 09:00:35 -07:00
750245f990 Remove unnecessary moves, avoid IncRef/DecRef of PyBools. 2017-08-25 09:00:35 -07:00
fbf573a6b8 Properly pass saved_for in BatchNorm/Conv as the relevant Backward function.
Previously, these Functions passed themselves, i.e. the saved_for from
ConvForward would be ConvForward.
2017-08-25 09:00:35 -07:00
9bda6dee8f Add AutoGPU guard and properly reference Python args from BatchNormBackwardBackward. 2017-08-25 09:00:35 -07:00
e02f7bf8a3 Update autograd notes (#2295) 2017-08-04 20:28:04 -04:00
7ea48eaf7a cuda 7.5 fix for gloo 2017-08-04 02:25:16 -04:00
d278a14141 Fix ZeroPad2d backwards with negative pads. 2017-08-03 21:16:37 -04:00
6b1ca4b4b6 variable shape error of LSTMCell, GRUCell (#2289) 2017-08-03 21:16:06 -04:00
65ddaf13a9 Improve cuDNN weight layout test 2017-08-03 02:06:27 -04:00
96156013c3 Make sure deserialized RNN modules have _data_ptrs too 2017-08-03 02:06:21 -04:00
a997cdbb25 Fix BatchNorm double backwards when training=False.
Changes for v.0.2.0 around using shared_ptrs rather than at::Tensors.
2017-08-03 10:47:34 +05:30
8db9df94b6 Merge commit '74e5328b03634e163df65d6c6877c6f03387b536' 2017-08-02 22:51:17 -04:00
6c9e3334b1 Merge commit '70c95dbe52102d70facf7fc5d31cb8bd9ae860d9' 2017-08-02 22:50:52 -04:00
b33f232678 disable cudnn when output_padding >= stride or dilation 2017-08-02 22:48:03 -04:00
058f50aa50 fix shape and correctness bugs in autograd/convolution BackwardBackward 2017-08-02 22:48:03 -04:00
8b06efea7a remove dead code for python ConvNd (moved to C already) 2017-08-02 22:48:03 -04:00
52b7a49b37 enable cudnn transposed dilated 2017-08-02 22:48:03 -04:00
47f4d549e0 refactoring the THNN calls in autograd/convolution.cpp to be more compact 2017-08-02 22:48:03 -04:00
5b6d1837c7 enable dilated transpose and gradgrad tests 2017-08-02 22:48:02 -04:00
69642d4423 add THNN bindings for DilatedConvTranspose in autograd/convolution 2017-08-02 22:48:02 -04:00
70c95dbe52 fix Conv3d non-contiguous weight bug 2017-08-02 22:47:09 -04:00
74e5328b03 remove limitations on output_padding in Conv* routines 2017-08-02 22:46:24 -04:00
814b65df4f remove limitations on output_padding in Conv* routines 2017-08-02 22:46:04 -04:00
a565b77791 add 2d and 3d dilated full Convolution 2017-08-02 22:44:59 -04:00
6e6dca001c add 2d and 3d dilated full Convolution 2017-08-02 22:44:44 -04:00
daf5b20cd7 Add tests that gradcheck grad sizes match input size and fix advanced indexing
case that fails check.
2017-08-02 07:13:01 +05:30
515efdab5d add reentrancy checking for gradcheck. 2017-08-02 07:13:01 +05:30
f9f98daf11 Remove save_mean/save_var from BatchNorm double backwards, as it's not needed.
These could cause a problem with double backwards because they were std::move'd in
Backward.
2017-08-02 07:13:01 +05:30
2ac1003228 Implement LogSoftmax (v.0.2.0) (#2265) 2017-08-01 14:32:05 +05:30
141224ad7c Implement SoftMax and NLLLoss double backwards. (#2233)
* Implement SoftMax and NLLLoss double backwards.

* Update legacy ClassNLLCriterion to add ignore_index.

* Fix serialization of legacy ClassNLLCriterion with ignore_index.
2017-07-30 09:02:04 +05:30
ac76ab5fca Increase tol. for float tensor qr big test.
test_FloatTensor_qr_big test is still a bit flaky on K80. Increasing tolerance to improve reliability as tests are moved around and results change for this test.
2017-07-27 14:23:06 -04:00
04f31aa034 Improve Variable.retain_grad 2017-07-27 20:36:14 +05:30
ae59e008cd add retain_grad method to variable, so gradient gets stored during backprop on non-user variables 2017-07-27 20:36:14 +05:30
e25b3d7bc5 replace long long types with size_t (#1267)
Work around bug in msvc compiler in win32 mode
2017-07-27 19:13:56 +05:30
925208af72 Implement BatchNorm double backwards (#2207)
* Implement BatchNorm double backwards as a python function called directly from C++.

This will be converted to C++ code once ATen is integrated with autograd.

* Some performance improvements via inplace ops and reusing calculations.
2017-07-27 06:00:31 +05:30
643f8d12ff [bugfix] in bce_with_logits logsumexp calculation (#2221)
* fix bug in bce_with_logits logsumexp calculation

* flake8 fix
2017-07-27 05:58:56 +05:30
fb8f9de498 fix for ATen API Change 2017-07-26 18:55:56 -04:00
cb9ad7a892 Opt into Trusty builds. (#2214)
* Opt into Trusty builds.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Bump to 2.7.9.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-27 04:04:57 +05:30
f7de7bab6e Merge commit 'fd97d92479e32e550866adfd1f0465e4cfa5e581' 2017-07-26 18:11:16 -04:00
fd97d92479 allow retain to be specified for unsafeTensorFromTH 2017-07-26 14:58:32 -07:00
f3aa97f169 Deduplicate THPUtils_checkLong/THPUtils_unpackLong (#2218)
There were two implementations of THPUtils_checkLong/THPUtils_unpackLong; one
that was a macro and one that was not, which is hella bad if you accidentally
include the macro before the real definition.  Now we always use the inline
function.

A reasonable follow-up task would be to un-macro-ify the rest of these functions.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-27 03:12:12 +05:30
b0648fc3fc Merge commit 'be9ef9283f297997afd3bf8e21147ec6bf09ebbf' 2017-07-26 17:25:39 -04:00
be9ef9283f Merge pull request #35 from ezyang/pr/undefined-dim-doc
Note [Undefined-dim versus 0-dim]
2017-07-26 12:42:33 -07:00
9c0d52a32f fix osx build errors related to long/int64_t 2017-07-26 12:36:25 -07:00
54545c2154 Note [Undefined-dim versus 0-dim]
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-26 12:34:13 -07:00
9ec7051442 Remove __func__ hack in auto nn. 2017-07-26 15:28:25 -04:00
2676c6357f Enable Conv groups gradgradchecks. (#2216) 2017-07-27 00:24:12 +05:30
ef3b09fb5f fix a bug where some scalars were getting truncated to integers incorrectly. 2017-07-25 14:27:16 -07:00
f194ac1e09 Merge pull request #477 from wickedfoo/feature_lp_pooling
GPU implementation of L_p feature pooling
2017-07-26 02:31:59 +05:30
26a0b9aa43 Merge pull request #1259 from wickedfoo/feature_lp_pooling
CPU implementation of L_p feature pooling
2017-07-26 02:31:50 +05:30
e548580f31 Add missing models to torch vision documentation (#2204) 2017-07-26 01:58:18 +05:30
421607a935 DataParallel device_ids slicing fixes (#2200) 2017-07-26 01:54:38 +05:30
7be545292d Update cudnn.py 2017-07-25 09:35:44 -04:00
a0e83280ef Update cudnn.py 2017-07-25 09:35:44 -04:00
aa35be2032 search for cudnn in conda 2017-07-25 09:35:44 -04:00
626840aef3 C function wrapper uniqueness (#1912)
* add SharedFunctionMaker to create Function shared in the graph

* Clean shared_ptr usage for only function that will be used in the graph

* make Function binding match Variable one

* remove unnecessary changes

* fix comments

* proper weakref implementation

* add call to clear in dealloc
2017-07-25 13:12:54 +05:30
bcea678e7b Update rebased functions to call apply. 2017-07-25 07:37:25 +05:30
1a52ca02ef Always return indices from MaxPool autograd functions to simplify implementation;
The callers (in functional.py) will filter out the return instead.
2017-07-25 07:37:25 +05:30
84314859af Implement double backwards for MaxPool2d. 2017-07-25 07:37:25 +05:30
9c2beb33c5 Implement double backwards for MaxPool1d. 2017-07-25 07:37:25 +05:30
7deba74969 Implement MaxPool{1d,2d,3d}Backwards (non-differentiable) functions. 2017-07-25 07:37:25 +05:30
48bb07a4db Implement double backwards for AvgPool3d. 2017-07-25 07:37:25 +05:30
bb86ed7b97 Implement double backward for AvgPool1d, AvgPool2d, LPPool2d. 2017-07-25 07:37:25 +05:30
291369ff1b Convert pooling functions to new-style, once_differentiable functions. 2017-07-25 07:37:25 +05:30
2118400e18 Fix lint. 2017-07-25 07:37:25 +05:30
39934da8b3 Address review comments. 2017-07-25 07:37:25 +05:30
c12b494329 Implement double backwards for ELU. 2017-07-25 07:37:25 +05:30
506d52dc33 Add check_gradgrad=False for new NLLLoss2d test. 2017-07-25 07:37:25 +05:30
7687c2677a Fix double backwards advanced indexing derivative wrt grad_output.
Also small legacy nn test issue and unrelated syntax issue.
2017-07-25 07:37:25 +05:30
97d21e243b Implement L1Cost double backwards. 2017-07-25 07:37:25 +05:30
0bda56956e Implement double backwards for auto-generated HardTanh. 2017-07-25 07:37:25 +05:30
40af93bb57 Optimize PReLU double backwards via a PReLUBackwards autograd function. 2017-07-25 07:37:25 +05:30
9608e37969 Implement double backwards for PReLU. 2017-07-25 07:37:25 +05:30
ec7c510557 Implement Softsign double backwards. 2017-07-25 07:37:25 +05:30
8636be3880 Ensure gradients wrt grad_outputs are checked in gradgradcheck. 2017-07-25 07:37:25 +05:30
fb2284f3a0 Add gradgrad checks for NN module and criterion tests. 2017-07-25 07:37:25 +05:30
9ec9dee27d Implement NN Criterion functions as potentially double backwards functions. 2017-07-25 07:37:25 +05:30
7b6aab9079 Unify implementation of _Loss and _WeightedLoss autograd functions. 2017-07-25 07:37:25 +05:30
852dd5f011 Convert _WeightedLoss functions to new style autograd functions. 2017-07-25 07:37:25 +05:30
085abee444 Rebase kl_div changes. 2017-07-25 07:37:25 +05:30
48b85fe012 Implement THNN non-criterion Functions as new style with backward/backward. 2017-07-25 07:37:25 +05:30
45ce4df74c Convert auto nn Functions (non-criterion) to new style. 2017-07-25 07:37:25 +05:30
5695cbf986 Add comments in loss.py and distance.py (#2189)
* Add examples in CrossEntropyLoss

1. Added examples in CrossEntropyLoss
2. Make consistent style of example for PyTorch docs
3. Delete unnecessary character '

* Change comments in distance.py

1. Delete x1, x2 from arguments and add eps in PairwiseDistance
2. For the shape, added input1 and input2 for readability (PairwiseDistance and CosineSimilarity).

* Add examples

Added the word 'examples' for PyTorch docs
2017-07-25 07:36:28 +05:30
03df5debe3 Gloo fixes for Linux + old cmake (2.8.0) + old glibc (CentOS6) 2017-07-24 21:59:58 -04:00
2ebdef0154 Add 'torch/lib/gloo/' from commit '1978bba3e421eceab6181bcbc838553091cedecc'
git-subtree-dir: torch/lib/gloo
git-subtree-mainline: ceb4f84d12304d03a6a46693e54390869c0c208e
git-subtree-split: 1978bba3e421eceab6181bcbc838553091cedecc
2017-07-24 21:59:49 -04:00
ceb4f84d12 Improve memory usage of cuDNN RNN modules (#2179) 2017-07-25 04:00:17 +05:30
112728cbe9 reformulate bce_with_logits to not use abs (#2195)
* reformulate bce_with_logits to not use abs

* flake8 fixes
2017-07-25 03:46:27 +05:30
dc17fb68e4 Fix minor bug in parallel_apply (#2193) 2017-07-25 03:45:00 +05:30
4a4d8841e6 Delete unused import 2017-07-23 12:48:11 -04:00
3c275fe7a0 Increase flaky test tolerance (#2185) 2017-07-22 11:37:34 -04:00
1978bba3e4 comment out unused parameters
Summary: This uses `clang-tidy` to comment out unused parameters (in functions, methods and lambdas) in fbcode. Cases that the tool failed to handle are fixed manually.

Reviewed By: igorsugak

Differential Revision: D5454343

fbshipit-source-id: 5dee339b4334e25e963891b519a5aa81fbf627b2
2017-07-21 14:57:12 -07:00
35757af6f7 Add broadcasting of weights to bce/bce_with_logits (#2161)
* added tests + removed explicit expand of weight in bce with logits

* add auto broadcasting of weight to BCELoss

* remove the need for _BCELoss

* formatting of warning

* remove TODO

* move across assert from _functions/thnn/loss.py

* flake8 fixes
2017-07-21 16:02:07 -04:00
8ab3d214d5 Fixes for DistributedDataParallel (#2168) 2017-07-21 16:00:46 -04:00
ec2def803b Merge commit '2efac3ed83a29f57f914e9044fdddd2ce7ecd6b7' 2017-07-21 15:58:23 -04:00
71ce3448d9 Fix torch.inverse when magma is not available
Fixes #2156
2017-07-21 15:57:43 -04:00
2efac3ed83 Fix torch.inverse when magma is not available
Fixes #2156
2017-07-21 15:57:25 -04:00
66bbe5d75a .creator -> .grad_fn in the code example (#2171) 2017-07-21 14:43:16 -04:00
ea607afd06 Add comments in nn.Upsample (#2175) 2017-07-21 14:34:58 -04:00
4f035f14de Add a support matrix for distributed backends 2017-07-21 14:19:46 -04:00
72e9e7abf7 Warning squash.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-21 14:13:11 -04:00
4d45ce7d11 Added UpSampling module and associated tests. 2017-07-21 12:25:50 +01:00
eed323c344 avoid warning 2017-07-20 10:59:56 -07:00
ea6f9a26b8 fix version number 2017-07-20 13:30:53 -04:00
3719b4247a return a sentinel value when THTensor has undefined dimensions. 2017-07-20 10:25:30 -07:00
bf1fc250d1 get conda root dir automatically, trick from Dockerfile 2017-07-20 11:02:30 -04:00
47942307b5 Comment that data of THStorage may be NULL.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-20 10:55:35 -04:00
6b69723d4f Document how Numpy memory management works.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-20 10:55:35 -04:00
5254846bb2 fix typo of error msg of cmul in THSTensorMath (#2158) 2017-07-20 02:58:54 -04:00
f3f478960e Convert Embedding to new style. (#1916)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-20 02:35:21 -04:00
e537023147 add functional embedding (#1987) 2017-07-20 01:53:37 -04:00
09abaa2189 make keepdim backcompat warnings emit in autograd as well (#2157) 2017-07-20 01:48:05 -04:00
575a4a98e0 Remove assertions with side effects 2017-07-20 01:45:57 -04:00
02e23f4f6b Unify argument names in tensor and Variable methods 2017-07-20 01:45:57 -04:00
8946502348 Accept all kinds of arguments in Variable.expand 2017-07-20 01:45:57 -04:00
e708de37cc Allow keyword args in long_arg options 2017-07-20 01:45:57 -04:00
4af40e3471 Let parallel_apply accept arbitrary inputs 2017-07-20 01:45:57 -04:00
f417cb062b Fix repeat backward to handle unsqueezed dims 2017-07-20 01:45:57 -04:00
11f3ccf98f Add missing Modules to nn.functional (#1801)
* add dropout2d and dropout3d to functional

added some loss functions to functional

added tests

using dropout from backend

added docs

fixes

* edited loss modules to call functional
2017-07-19 15:55:21 -04:00
31894cafdd add support for advanced indexing with less than ndim indexers, ellipsis (#2144) 2017-07-19 15:51:03 -04:00
95ccbf8b0b better error message in load_state_dict when there are inconsistent tensor sizes (#2151) 2017-07-19 15:50:29 -04:00
a5422d14c8 Merge commit 'bd6263c338c717de880cddfed660b5aa06ee108b' 2017-07-19 15:48:54 -04:00
82143487b3 Add CUDA support for arange
Also enables CUDA for range
2017-07-19 15:48:20 -04:00
bd6263c338 Add CUDA support for arange
Also enables CUDA for range
2017-07-19 15:43:00 -04:00
f4a565ded9 Merge commit '1c6a08c1c2a50a7048ae9e6e11290740d24a8374' 2017-07-19 15:42:20 -04:00
1c6a08c1c2 fix lint 2017-07-19 12:41:17 -07:00
a5c2546c0f version bump 2017-07-19 12:34:43 -07:00
13e84e460b Use unaligned store intrinsic to enable vectorized reductions on unaligned buffers
Summary: When performing reductions on fp16 buffers, gloo assumed that both buffers were either aligned to 32 bytes or misaligned by the same offset. This may not hold in intermediate steps of halving-doubling allreduce, when the reduction is performed on some offset within the receive buffer. The fix is to use intrinsic instructions that work with unaligned pointers.

Reviewed By: akyrola

Differential Revision: D5450103

fbshipit-source-id: 9a1c8f8c34d2e62223f6d5c21573ea1cfad6537f
2017-07-19 11:06:32 -07:00
4d5d9de541 Merge commit '768b7c0dee34b614ab1cd8f89c69ec7d86c19c88' 2017-07-19 12:22:36 -04:00
9da882e396 Merge commit 'ae3a8d5d2eaa1b15d825b86ce706b046e68733b8' 2017-07-19 12:21:52 -04:00
15bece50d1 Merge commit 'cfcf2af95f91a88ec61cbcac8b30a718e7332aa5' 2017-07-19 12:20:54 -04:00
8144f7c95d Merge commit '58334a0c4b3c386931293f7fbee3d2cf066221a5' 2017-07-19 12:20:20 -04:00
b660303a16 Static linking against libstdc++ in Binary Build mode 2017-07-19 12:19:36 -04:00
768b7c0dee Static linking against libstdc++ in Binary Build mode 2017-07-19 11:23:31 -04:00
ae3a8d5d2e Static linking against libstdc++ in Binary Build mode 2017-07-19 11:23:21 -04:00
58334a0c4b static MKL detection and linkage fixes 2017-07-19 11:22:46 -04:00
cfcf2af95f add explicit BLAS linkage to THC when linked against magma (in binary build) 2017-07-19 11:22:23 -04:00
f3df24269d Merge commit '975550512200cfa1ae18e21400e7efa3924a3d46' 2017-07-19 11:05:51 -04:00
c4120f34bf move to model with cuda indexing tensors for cuda tensor adv indexing 2017-07-19 11:05:10 -04:00
9755505122 move to model with cuda indexing tensors for cuda tensor adv indexing 2017-07-19 11:04:49 -04:00
8b42308f71 Bug in line 381 (sparse) (#2130)
The function iterates over columns and sets "sparsity" fraction of entries in each column to 0. The number of zeros in a column (num_zeros) is then ceil(rows*sparsity)
2017-07-18 22:55:06 -04:00
685ae4813e Squash "macro expansion producing 'defined' has undefined behavior" warnings.
Fixes #2141.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-18 22:24:55 -04:00
a0fef9dd22 Merge commit '703429d49eb397102ba20e6d4c0dd7714be001a5' 2017-07-18 20:17:26 -04:00
703429d49e Make clang shut up about class/struct mismatch.
Makes us -Werror clean again, I think.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-07-18 20:16:20 -04:00
567d95fa09 Merge pull request #25 from killeent/nullable-tensors
add support for Null Tensors to functions
2017-07-18 17:35:02 -04:00
7914d67ce3 Merge pull request #20 from killeent/type-equality
operator== for type
2017-07-18 14:32:45 -07:00
8451468d8b still generate multiple versions 2017-07-18 14:31:35 -07:00
138b216686 add support for Null Tensors to functions 2017-07-18 07:51:51 -07:00
6f6d70ffed Merge commit 'dc5854477951765f5edbac34b0c228449de1b56b' 2017-07-18 01:34:54 -04:00
dc58544779 fix baddbmm for expanded tensors 2017-07-18 01:33:59 -04:00
e13704c467 fix shadowed variable name
Summary: When compiled with -Werror=shadow-compatible-local, cannot reuse a variable name. This passed our tests, but some people use stronger settings to compile.

Differential Revision: D5440805

fbshipit-source-id: a246af748717fb7e0e7a321e1ac4ddfef68ae524
2017-07-17 19:10:30 -07:00
e9dd8e0e3b Use one key for all pairs per node
Summary: To reduce round trips with store handlers, it is better to store all addresses in one key instead of one address per pair. This is what this implements.

Reviewed By: andrewwdye

Differential Revision: D5435893

fbshipit-source-id: 2d3ea3a2822c3b934ff2578d44a262e7bfbde6d0
2017-07-17 17:35:19 -07:00
a3c9054245 Add comments in loss.py (#2128) 2017-07-17 13:56:19 -04:00
c7b624651e CodeMod: Prefer ADD_FAILURE() over EXPECT_TRUE(false), et cetera
Summary:
CodeMod: Prefer `ADD_FAILURE()` over `EXPECT_TRUE(false)`, et cetera.

The tautologically-conditioned and tautologically-contradicted boolean expectations/assertions have better alternatives: unconditional passes and failures.

Reviewed By: Orvid

Differential Revision:
D5432398

Tags: codemod, codemod-opensource

fbshipit-source-id: d16b447e8696a6feaa94b41199f5052226ef6914
2017-07-16 21:24:13 -07:00
ba544aa0ad Add comments in nn.ELU (#2111) 2017-07-16 23:04:11 -04:00
849fb1f7e3 Fix when running with python -O (#2120) 2017-07-16 13:51:14 -04:00
16dd997239 Spelling tweaks for documentation (#2114) 2017-07-15 13:16:32 -07:00
1c0135b6f2 CreateCommonWorld: pass timeout for storehandler
Summary: Use the CreateCommonWorld timeout for the storehandler as well, not just the device connect.

Reviewed By: andrewwdye

Differential Revision: D5425923

fbshipit-source-id: 936d2129e2db3bfed8759ca097b75843d3931d5f
2017-07-14 19:20:11 -07:00
a7d82b935f Merge commit '9851ef4979bad0c8618e586e711c1bfd8648fd52' 2017-07-14 17:31:21 -04:00
af7aea9f17 Merge commit 'f805a8388be8dc55af0e3aa165b13cd0fce484d3' 2017-07-14 17:29:50 -04:00
366299f9f3 Wrap unbiased flag in var, std, varall, stdall 2017-07-14 17:29:06 -04:00
9851ef4979 Wrap unbiased flag in var, std, varall, stdall 2017-07-14 17:28:14 -04:00
f805a8388b Wrap unbiased flag in var, std, varall, stdall 2017-07-14 17:25:25 -04:00
2f7b6db429 Merge commit 'd2874c560ebd197297ef737a084b6f7ee3f03dc6' 2017-07-14 17:21:16 -04:00
16203f3325 fix test 2017-07-14 17:04:21 -04:00
80d067e70f retain_variables -> retain_graph (#2107)
Closes #1928
2017-07-14 16:45:25 -04:00
d2874c560e lint fixes 2017-07-14 16:32:15 -04:00
83596bdcb1 produce a Declarations.yaml file that describes Functions/Type/Tensor methods that framework produced. 2017-07-14 12:34:03 -07:00
f3f8ce44bd Merge pull request #18 from soumith/master
Fix handling of if_true/if_false in ATen
2017-07-14 15:16:07 -04:00
33ac9cdc10 add ATen tensor support to pytorch tuple_parser (#2102) 2017-07-14 13:56:02 -04:00
38ba935547 operator== for type 2017-07-14 10:39:40 -07:00
128e02d792 allow type inference to work on TensorList 2017-07-14 10:27:05 -07:00
7ee7542fc8 Fix handling of if_true/if_false in ATen 2017-07-14 11:58:03 -04:00
52a9367fa7 Fix minor typo (#2100)
Fixed minor typo in Autograd mechanics docs.
2017-07-14 10:20:13 -04:00
08bb3b7cc8 Merge commit '7e498d2219c8dbeb801fc4cefa36b147bbf76ff4' 2017-07-14 02:55:55 -04:00
43eaa28b9f fix empty Tensor mmap 2017-07-14 02:55:05 -04:00
7e498d2219 fix empty Tensor mmap 2017-07-14 02:54:39 -04:00
d6bc2642e7 Add ignore_index to NLLLoss2d 2017-07-13 23:22:48 -04:00
7d3511f5f2 Half fixes for ATen and CUDA 9.0 2017-07-13 22:52:39 -04:00
a5a8ab10b0 fix Hardtanh argument names to be consistent between functional and Module 2017-07-13 22:46:51 -04:00
25b591eb05 lint fixes 2017-07-13 22:41:01 -04:00
06f94a7d59 better error message when thread_local is not supported (#2092) 2017-07-13 22:32:10 -04:00
027264cd64 Merge commit '9e720f15477d2d7a388c5b5ec7d397fa5706d64f' 2017-07-13 19:59:07 -04:00
7c14c377df Merge commit 'd8fee1ebe675b9d31894ac79145f2b2629e322e4' 2017-07-13 19:25:56 -04:00
c674923bcc Merge commit 'ed6f5d7038f0e3873c2ed6add2ede7c9ab38e1ea' 2017-07-13 19:24:22 -04:00
d8fee1ebe6 add launch_bounds to greedy kernels 2017-07-13 19:23:29 -04:00
ed6f5d7038 add launch_bounds to greedy kernels 2017-07-13 19:23:24 -04:00
9e720f1547 fix bug in method declarations 2017-07-13 16:22:52 -07:00
ab26fa01e6 install vision in devel dockerfile, minor fixes to dockerfile (#2090) 2017-07-13 19:06:41 -04:00
f4ae64a6c7 add isCUDA() on Type 2017-07-13 15:13:20 -07:00
07fcd977bb add cudnn data type processing for ATen tensor (#2087) 2017-07-13 16:37:53 -04:00
54cabb8bf3 Correct negative dim behavior in torch.stack (#2084)
Fixes #1950
2017-07-13 16:29:31 -04:00
42485d87c2 Set the current device in each engine's thread (#2081)
Fixes #2017
2017-07-13 16:24:38 -04:00
007d6ad816 write generated_cpp. to a file rather than as output to make error reporting clearer. 2017-07-13 11:04:52 -07:00
abd433fa07 Merge commit '6db960fbcff7ae194c6827c73113c222391f2c3e' 2017-07-13 13:49:26 -04:00
6db960fbcf dont clobber gen.py error, fix for old versions of python 2017-07-13 10:45:14 -07:00
384f03f1be Merge commit '48b797a785c1fc6ea34398985c49b2c7c55d28ae' 2017-07-13 10:40:58 -04:00
c011d4f3d6 resolves #1991 (#2073) 2017-07-13 09:57:33 -04:00
f98c384973 Raise error when call from_numpy on 0-dim array (#2075)
* Raise error when call from_numpy on 0-dim array

Fixes: #2055

* reword error message
2017-07-13 09:56:12 -04:00
48b797a785 fix lint 2017-07-13 03:22:31 -04:00
8983bf13f4 fix max and min docs 2017-07-13 03:03:27 -04:00
20ce45b0c3 fix EmbeddingSum offsets initialization 2017-07-13 02:57:25 -04:00
1e98155711 long ->size_t 2017-07-13 02:40:44 -04:00
1c14178c65 fix osx compilation 2017-07-13 02:38:56 -04:00
37183e91de add normalize docs to sphinx 2017-07-13 02:31:57 -04:00
14337693d0 Merge commit 'b900a49308cb0363d00add7e123b824fda3eab37' 2017-07-13 01:01:38 -04:00
58e4caf80f add missing docs 2017-07-13 01:01:04 -04:00
b900a49308 Merge pull request #11 from soumith/master
Fix ATen build for debug python
2017-07-12 21:51:36 -07:00
c888857461 Conv double backward groups (#1993)
* add support for groups in double backward

* add tests for group in double backward

* fix lint

* separate some tests to reduce number of test cases

* remove redundant testing for different number of output channels
2017-07-13 00:41:14 -04:00
7053b84c0e Merge commit '41abcd4b41308b3453cce6731d896d094b23c62a' 2017-07-13 00:39:35 -04:00
8304dc4d68 Merge commit '703ccbb8cbe1c4ce3eeb62548ce51f71181883d6' 2017-07-13 00:39:03 -04:00
c48d50a2e2 Advanced Indexing: Calculate linear offsets directly on the GPU when working with CUDA Tensors 2017-07-13 00:38:23 -04:00
41abcd4b41 Advanced Indexing: Calculate linear offsets directly on the GPU when working with CUDA Tensors 2017-07-13 00:37:20 -04:00
703ccbb8cb Advanced Indexing: Calculate linear offsets directly on the GPU when working with CUDA Tensors 2017-07-13 00:37:13 -04:00
27da4eafc2 Remove more advanced indexing duplicate tests (#2071) 2017-07-13 00:30:52 -04:00
459cb697b5 Merge commit 'ce96b84ccbdfbbee7f744942b1bb9fdc5924e442' 2017-07-13 00:26:06 -04:00
ce96b84ccb Check for shared_mem size in multinomial single-sample implementation
Handle limited shared memory on function torch.multinomial

Update THCTensorRandom.cu
2017-07-13 00:25:13 -04:00
feddb03d58 LP pooling kernels 2017-07-12 19:31:06 -07:00
fe3802d724 match PyTorch syntax 2017-07-12 16:58:57 -07:00
b8d0c7fc0d checked cast does it all 2017-07-12 14:41:04 -07:00
ea563c1df1 Make weight norm pickleable (#2066) 2017-07-12 17:21:22 -04:00
2520459617 cpu lp pooling 2017-07-12 14:21:17 -07:00
841173c530 Use NamedTemporaryFile to avoid filename collisions (#2069) 2017-07-12 17:14:42 -04:00
f4c502e8a8 basic cat implementation in ATen 2017-07-12 12:04:24 -07:00
593c5e12e1 Merge commit 'be18499e852d8b292491e27d87dadebe68931fc3' 2017-07-12 14:55:21 -04:00
dc2ed7fd33 Fix ATen build for debug python 2017-07-12 14:52:03 -04:00
81fd2bf2d0 fix some language / typos 2017-07-12 14:47:36 -04:00
8915e2710c Refactor scatter/gather and add distributed docs 2017-07-12 14:47:36 -04:00
ebd5c085dc Fix a memory leak in DataChannelTCP 2017-07-12 14:47:36 -04:00
a9759ef401 Fix undefined symbol errors in THD 2017-07-12 14:47:36 -04:00
f899eafe85 Merge commit '5894864a1c5c9596da0ae88b477ee421e3a5065b' 2017-07-12 14:33:47 -04:00
169ca67a4e Adding Spatial Transformers w/CuDNN support 2017-07-12 14:32:06 -04:00
5894864a1c Adding Spatial Transformers w/CuDNN support 2017-07-12 14:31:14 -04:00
41c8fee3e7 Merge commit '7c10f1b932fbebdf0e9105f2848229ea22109747' 2017-07-12 12:57:52 -04:00
bb891758bf Merge commit 'a20729244b43f7072797cc5e93898df795455e5b' 2017-07-12 12:57:12 -04:00
7c10f1b932 Avoid two unnecessary copies in addmm backward
The `r_` and `t` tensors become different objects, even though they
point to the same data. Avoid the copy whenever beta=0.
2017-07-12 12:56:17 -04:00
a20729244b Avoid two unnecessary copies in addmm backward
The `r_` and `t` tensors become different objects, even though they
point to the same data. Avoid the copy whenever beta=0.
2017-07-12 12:56:08 -04:00
a74fb22b9a fix inplace division for python3 (#2063) 2017-07-12 11:37:55 -04:00
0d91048639 add dummy tensor.data property, to provide interpretable error message to users (#2058) 2017-07-12 10:22:08 -04:00
10e23943b3 Fix missing _forward_pre_hooks in serialized modules (#2057) 2017-07-11 18:23:35 -04:00
be18499e85 Fix a few C++ warnings
1) Type needs a virtual dtor
2) Tensor move ctor should be noexcept
3) Make constructors from Context* and Type* explicit
2017-07-11 15:18:15 -07:00
1037f30e41 add some documentation to Tensor 2017-07-11 11:00:45 -07:00
78ecc2d3b1 Alias multinomial sampling in Cuda (#784)
* Support Multinomial Alias sampling in cuda

Moving benchmark file

* Review changes
2017-07-11 13:23:35 -04:00
f483679425 Implementation of Alias Multinomial for faster Multinomial sampling (#1046) 2017-07-11 13:22:36 -04:00
dfd5d8d0fe Avoid two unnecessary copies in addmm backward (#1971)
The `r_` and `t` tensors become different objects, even though they
point to the same data. Avoid the copy whenever beta=0.
2017-07-11 11:55:22 -04:00
158c7e86dd add basic gitignore, thpp -> at doc fix 2017-07-11 08:32:58 -07:00
73128f7b08 fix minor typos (#2051)
* Update extending.rst

fix typo

* Update cuda.rst

fix typo
2017-07-11 11:01:41 -04:00
f536c662bf fix op in docs (#2048) 2017-07-11 10:36:19 -04:00
2ecb18881c add DynamicType variants for ATen functions. 2017-07-11 10:35:03 -04:00
9d8cff9bc1 initialize aten and pytorch to share the same THCState 2017-07-11 10:35:03 -04:00
ab3d85c410 add build commands for ATen 2017-07-11 10:35:03 -04:00
e58e27cf16 Add 'torch/lib/ATen/' from commit '9d0c674cb7bcfae989d69f988363c1688c22fa89'
git-subtree-dir: torch/lib/ATen
git-subtree-mainline: 3314d51dcc1535dc2d00d357be889807d1bb8c57
git-subtree-split: 9d0c674cb7bcfae989d69f988363c1688c22fa89
2017-07-11 10:33:24 -04:00
3314d51dcc Add __repr__ to Avgpool and maxunpool layers (#2047) 2017-07-11 10:13:22 -04:00
1ef1dd9cad Add comments for readability (#2005) 2017-07-10 23:02:56 -07:00
98206c326e Fix ref counting in wrapped tuple functions (#2042)
Fixes #1963
2017-07-10 18:46:06 -04:00
9d0c674cb7 always use a custom default float 2017-07-10 15:37:18 -07:00
bff762c3ff python style fixes 2017-07-10 15:37:07 -07:00
10a8ccf27f only test gets for advanced indexing with duplicates (#2041) 2017-07-10 16:05:55 -04:00
0a9e8a23ef add atan2 function to autograd (#2040) 2017-07-10 16:04:35 -04:00
8b003565ec remove inaccessible median variant (#2015)
With the addition of medianall() this variant can no longer be accessed, because both it and medianall take no arguments.
2017-07-10 10:42:45 -04:00
53ac2d46c6 Fix typos in docstrings. (#2034) 2017-07-10 10:35:46 -04:00
318ea29a86 Merge commit 'ab3a9e177ee5eb7d39de2d385ba1e141858e8329' 2017-07-10 10:30:24 -04:00
ab3a9e177e Fix sdot_ bug for runtime F2C symbol conflicts by using cblas where available 2017-07-10 10:29:26 -04:00
46a868dab7 [Ready] Limit docs line length (#1900)
* some docs are ready

* docs

* docs

* fix some more

* fix some more
2017-07-10 10:24:54 -04:00
581921f696 support unsafe functions for getting/constructing tensors from TH objects for backward compat. 2017-07-09 21:25:38 -07:00
0025e1c776 Fix typos in the docstrings of Conv3d, AvgPool3d and MaxPool3d (#2030)
* Fix a typo of the docstring of Conv3d

* Fix typos in docstrings of 3D operations.
2017-07-09 23:20:07 -04:00
9cba97a833 Pairwise-exchange benchmark with bandwidth measurement
Summary: A simple benchmark to determine network bandwidth for pairwise communication.

Reviewed By: plapukhov

Differential Revision: D5159607

fbshipit-source-id: d16c3ed3a0c2ae182138df91bdae821f5508c6ac
2017-07-09 15:55:20 -07:00
c6d7e1e6bf added input size checks to batchnorm (#2020) 2017-07-09 15:31:24 -04:00
49f679d0e9 Acknowledge the existence of cpu HalfTensor (#2018) 2017-07-08 10:03:36 -04:00
f0788afb0c lazily initialize cuda so that we behave similar to PyTorch 2017-07-07 22:21:31 -07:00
a4dc7dcd04 osx build issues and clang warnings 2017-07-07 11:50:02 -07:00
5dd05ed8ee remove Sparse from dispatch for now, will add dispatch variants later 2017-07-07 11:40:08 -07:00
0a34f05d5b Always include THNN in the build, don't check for CUDA twice
As a result, the project builds on MacOS with gcc-6 (without CUDA).
2017-07-07 14:14:02 -04:00
4fda678a85 fix build issue when cuda does not exist 2017-07-07 10:54:17 -07:00
ebdec9a837 Skip distributed tests if not supported (#2004) 2017-07-07 11:06:56 -04:00
c3c7845572 added asserts that grad_output + input are contiguous (#2000) 2017-07-07 09:14:02 -04:00
90d0762d14 Use torch.arange instead of torch.range in test_torch.py (#1996) 2017-07-07 00:06:31 -04:00
73fead9f8f add shape alias (#1983) 2017-07-05 19:12:37 -04:00
3748b6d3eb Data parallel fix for https://github.com/pytorch/pytorch/issues/1857 (#1880)
* Data parallel fix for https://github.com/pytorch/pytorch/issues/1857
searches recursively for variable in input

* parallel_apply.py lint
2017-07-05 11:46:00 -04:00
b3589b04fd Fix exceptions not being caught (#1948)
Adding -fexceptions to both torch and pytorch C/C++ builds fixes tests
not passing.

Closes #1297
2017-07-05 00:25:39 -04:00
5964394a4c return empty iter when tensor is empty 2017-07-04 17:29:27 -04:00
1aaa24d99b add medianall prototype to docs 2017-07-04 16:52:36 -04:00
295ed7e264 Merge commit 'ab7d4e2bcea5cae8f05873fb0bbb31985cc58d47' 2017-07-04 16:47:48 -04:00
ab7d4e2bce add missing definition 2017-07-04 16:46:04 -04:00
ae65236490 Fix typo 2017-07-04 15:19:05 -04:00
c2069a15e0 Merge commit '56df97ce939985a30dcfefb1136bf45faf64413c' 2017-07-04 15:18:14 -04:00
56df97ce93 remove unnecessary contiguous assertion 2017-07-04 15:17:15 -04:00
89c682dfb9 Merge commit '0dbf871d9ec424f1a7897af77bf93219d3be23bf' 2017-07-04 14:56:53 -04:00
ae839f4b2e Merge commit 'f425c5216b7fe35dd03e0161a3440ec968c63636' 2017-07-04 14:56:22 -04:00
05c2bafc9d Have median reduce over all dims and return just the value when dim is not provided 2017-07-04 14:55:37 -04:00
0dbf871d9e Have median reduce over all dims and return just the value when dim is not provided 2017-07-04 14:55:30 -04:00
f425c5216b Have median reduce over all dims and return just the value when dim is not provided 2017-07-04 14:55:19 -04:00
635bb5ec9d corrects typo 2017-07-04 11:09:40 -04:00
a7f6b0ab4f Merge commit 'e5bac2dd2d69772938482c1431db1fc1efb64c6f' 2017-07-03 20:41:28 -04:00
e5bac2dd2d Add critical section to BLAS gemm.
This is needed because of possible races in SpatialConvolutionMM (and others that use gemm)
if the BLAS library is not thread-safe.

In terms of performance, there's not much benefit to run two gemms in parallel, because the
BLAS libraries have their own all-occupying gemms anyways.
2017-07-03 20:40:21 -04:00
ec8da55a7d bind THS THCS, leaving all operators unimplemented. This is required because THPP can represent Sparse tensors even though the wrapper doesn't implement any operators. 2017-07-03 16:52:41 -07:00
b4414c0dc3 Handle None in modules list.
It's often useful to add None to an nn.ModuleList to keep the indexing
of the module list to match some other property.
2017-07-03 18:53:21 -04:00
39edc378fb Fix lint. 2017-07-03 18:51:22 -04:00
f6578c1b24 Implement double backwards for Dropout and FeatureDropout. 2017-07-03 18:51:22 -04:00
daa84e7663 Implement bilinear double backward. 2017-07-03 18:51:22 -04:00
1aa145dbac Implement ConstantPad2d double backwards. 2017-07-03 18:51:22 -04:00
d4b8834131 Improve non-contiguous testing in TestAutograd: (#1933)
* Improve non-contiguous testing in TestAutograd:
1) Test gradcheck and gradgradcheck with non-contiguous inputs
2) Test gradgradcheck with non-contiguous gradoutputs (gradcheck would take more work)
3) Fix discovered issue in Prod backwards.

* Simplify non-contiguous setting wrt View.
2017-07-03 18:49:52 -04:00
699d1ec7fb Address flaky Norm test issues:
1) Add a correction for 1.5 norms to ensure input can't be zero.
2) Increase test tolerance.
2017-07-03 18:48:22 -04:00
05062a1439 Better handle random seeds in tests.
Previously, there were 2 issues with test_autograd randomness:
1) Many random operations (e.g. random selection in prod_zeros) happened
   before the torch random seed was set (because it was set in run_tests
   at the end of the file).
2) The random seed was not set consistently: run_tests would set it to the
   proper value, but each call to setUp would set it to 0 (because SEED wasn't
   global in run_tests), which made setting the seed mostly worthless.
2017-07-03 18:48:22 -04:00
e187ba7a9f Decrease likelihood that Fmod/Remainder tests fail due to numerical jacobian check.
Previously, these tests added 5e-2 to the denominator tensor (the same as the div
tests), which only avoids divide by 0, but not issues with computing the numerical
jacobian due to non-linearity of fmod/remainder, when input / divisor is close to an
integer.  These tests now add 1.5 to the denominator, which is the same as the non-tensor
version of the tests; Note that we can still hit the above condition but it will be much
less likely.
2017-07-03 18:48:22 -04:00
35ed224d04 Merge commit '8a24f2b4d8646de10b497c2eca2f1edc525a1e09' 2017-07-03 00:49:59 -04:00
72b292d45c Merge commit '733a7c6d9a22dfc9be1b11d47384991208658bfb' 2017-07-03 00:49:52 -04:00
5b4cd9bb49 Merge commit 'c691fc6dc711814a06107d4a9b763f34bff5afca' 2017-07-03 00:49:34 -04:00
c691fc6dc7 Add a nonContigDim reduction kernel to improve latency for small tensors. (#768) 2017-07-03 00:39:40 -04:00
42cf68b402 Make reduction functors accept only constant arguments (#753)
(similar to MaxValuePair and MinValuePair above).
2017-07-03 00:35:39 -04:00
8a65ef1098 cc 2.0 -> 3.0 in docs. 2017-07-02 22:08:42 -04:00
406040f6a9 fix torch.is_tensor not recognizing HalfTensor (#1934) 2017-07-02 10:13:44 -04:00
e26139b7f7 fixed shapes in GRU and LSTM docs. 2017-07-01 23:15:10 -04:00
457587088a Fix broadcasting issues in binary_cross_entropy_with_logits (#1944)
* don't re-seed cuda device if in bad fork

* avoid broadcasting in binary_cross_entropy_with_logits

* assert input sizes for BCEWithLogitLoss

* added check that BCEWithLogitsLoss == Sigmoid + BCELoss

* fix flake8 issues

* rename test_bce_with_logits_gives_same_result_as_bce_and_sigmoid -> test_bce_with_logits_gives_same_result_as_sigmoid_and_bce_loss

* add warning in BCELoss about input shapes

* fix lint
2017-07-01 23:06:36 -04:00
da0fad8a7a Use torch.matmul in nn.Linear (#1935)
This takes advantage of the broadcasting behavior of torch.matmul to
support inputs with more than two dimensions. The extra dimensions are
treated like part of the batch dimension, much like nn.Bottle in Lua
Torch.

There are a few related small performance changes:

 * Addmm computes the gradient in column-major for inputs in
   column-major format
 * Variable.mm calls Addmm in-place with the desired output buffer
2017-06-30 16:53:26 -04:00
2c038f2074 Add weight normalization implementation (#1945)
* Add weight normalization implementation

This adds forward "pre-hooks" which get called before the module's
forward() method. Weight norm is implemented as a hook which calculates
the weight variable from the weight_g and weight_v every iteration.

Based on @rtqichen implementation.

* Specify return type
2017-06-30 15:41:40 -04:00
b3e500c522 fix docs generation warnings 2017-06-30 14:39:21 -04:00
b3f6ff1b3d Fix unused linker argument warnings. (#1958)
* Fix unused linker argument warnings.

This patch began when I noticed the following clang warning:

clang: warning: -Wl,-rpath,RIGIN: 'linker' input unused
clang: warning: argument unused during compilation:
'-L/home/ezyang/local/pytorch/torch/lib/tmp_install/lib'

The warning is minor, but I was a bit worried our rpath wasn't
setup correctly.  Actually, it was, and there wasn't a problem,
but I had to spend some time figuring out exactly what was going
on, and by the end of it, I might as well fix the warning.  In the end, I ended
up filing two upstream tickets for ccache and cmake:

- https://github.com/ccache/ccache/issues/189
- https://gitlab.kitware.com/cmake/cmake/issues/17025

We can remove the warning by using CMAKE_EXE_LINKER_FLAGS and
CMAKE_SHARED_LINKER_FLAGS, which have sane macro expansion rules
(although still slightly insane: the first level of escaping gets removed.)
To ensure that the rpath was being set correctly, I ran
objdump -x torch/lib/build/TH/libTH.so | grep RPATH and verified that ORIGIN
was setup correctly.

I also considered using CMAKE_INSTALL_RPATH, but the rpath here doesn't
seem to get set until you actually install, which is a change in behavior,
and I wasn't sure if anyone was relying on rpaths being setup in the build
directory.

There is a SLIGHT behavior change, in that if we happened to need these
LDFLAGS passed to the static linker, they won't get passed. I don't
think we ever build static libraries today so this shouldn't be a problem.

P.S. Because of the ccache bug, you may continue to see these warnings
after this patch.  If you apply https://github.com/ccache/ccache/pull/190
and clear your cache, it will solve the problem.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Remove unnecessary -Qunused-arguments

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-30 14:15:31 -04:00
6df23b418d mark tools as excluded in find_packages (#1915) 2017-06-29 13:49:56 -04:00
e5b5154768 Make cudnn warnings clean. (#1940)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-29 10:58:04 -04:00
bfaddc0a19 Warp intrinsic fixes (#785) 2017-06-29 00:14:07 -04:00
4d5075add2 Add ignore_index to nll_loss and cross_entropy (#1937) 2017-06-29 00:10:13 -04:00
0a95613cef Improve error message when accessing attributes that don't exist (#1936)
New:
   >>> torch.autograd.Variable(torch.randn(3, 3)).foobar
   AttributeError: 'Variable' object has no attribute 'foobar'

Old:
   >>> torch.autograd.Variable(torch.randn(3, 3)).foobar
   AttributeError: foobar
2017-06-28 20:13:15 -04:00
8a4eb50ed1 Speed up torch.matmul for 3D+ x 2D/1D tensors (#1931)
If the left tensor is 3D+ and the right tensor is at most 2D, we can
fold the batch into the matrix dimension and use torch.mm instead of
torch.bmm. In practice, this is faster especially if the right tensor is
column major.
2017-06-28 17:43:21 -04:00
b5e1df046e fixed typo in formula of GRU in doc (#1921) 2017-06-28 11:02:06 -04:00
08648061f7 Advanced Indexing 2A - Colons + Adjacent Adv Indexers (#1890) 2017-06-28 10:01:45 -04:00
4c35c630ec Enable norm gradgradchecks by lowering precision requirements. 2017-06-27 18:44:14 -04:00
3744efeaf8 Fix double backwards for prod. 2017-06-27 18:44:14 -04:00
bc032be13e Implement negative dimensions and double backwards cumprod. 2017-06-27 18:44:14 -04:00
f814a892cf don't re-seed cuda device if in bad fork (#1923) 2017-06-27 13:24:52 -04:00
d592e188f7 port of ConcatDataset (#1902) 2017-06-27 12:31:56 -04:00
ae61f3ff42 adds poisson NLL loss (#1779) 2017-06-27 10:04:54 -04:00
1f391a42f7 fix warnings for docs generation 2017-06-27 00:18:32 -04:00
b933423495 support more than 8 gpus (#774) 2017-06-26 16:49:14 -04:00
ee1b7b50b3 fix docs for broadcast warning 2017-06-26 14:50:57 -04:00
7cdd018db4 Fix assertEquals for lists and tuples (#1913)
zip finishes once the first iterator is exhausted, so we were erroneously allowing things like assertEquals([1, 2], [1]) to pass.
2017-06-26 14:13:21 -04:00
7806a09f03 Fp16 fixes for CUDA 9 (#783) 2017-06-26 11:38:18 -04:00
7523c49f03 add missing INCREF 2017-06-26 11:33:16 -04:00
733a7c6d9a Fix segfault in SpatialDepthWiseConvolution w/o bias 2017-06-26 16:33:45 +02:00
32e666551a Fix lint. 2017-06-24 09:45:21 -04:00
ab0c321f80 Fix index_copy gradgrad test by ensuring indices cannot be repeated. 2017-06-24 09:45:21 -04:00
9db14936eb Ensure masked_select tests don't have masks of all zeros which yields
0-dimensional tensors.
2017-06-24 09:45:21 -04:00
e5857c5f1c Implement Gather double backwards. 2017-06-24 09:45:21 -04:00
7da77c4255 Add ScatterAdd autograd function. 2017-06-24 09:45:21 -04:00
656cb1c31a Implement and test double backwards for IndexCopy. 2017-06-24 09:45:21 -04:00
4ab4938cf0 Fix and test single backwards IndexCopy. 2017-06-24 09:45:21 -04:00
1324c4b081 Implement double backwards for masked_scatter. 2017-06-24 09:45:21 -04:00
bb3779efe8 Add broadcasting to masked_select. 2017-06-24 09:45:21 -04:00
7c24a3d5cf fix arguments for cudnnFindEx for transposed wgrad 2017-06-23 23:18:32 -04:00
194bc404b5 CUDA 9
Summary:
Adds basic CUDA 9 support, including adding Volta arch, and making appropriate modifications for half precision datatype changes
Closes https://github.com/facebookincubator/gloo/pull/49

Differential Revision: D5315336

Pulled By: pietern

fbshipit-source-id: 6468b0f357206d604bdcfec69ba82509a2c91407
2017-06-23 16:41:27 -07:00
a9ea975977 enable warnings in build and fix warnings 2017-06-23 11:49:09 -07:00
b1a84e3c70 update readme and add assign_(Scalar) variant 2017-06-23 11:27:55 -07:00
8a24f2b4d8 Fix segfault in SpatialDepthWiseConvolution w/o bias 2017-06-23 11:14:00 +02:00
66d93b60b3 fix a bug with scalar handling by simplifying the maybeScalar check. 2017-06-22 23:07:56 -07:00
2af6ba3b2a handle select and operator[] style operations 2017-06-22 22:57:43 -07:00
b59b44fac7 add checks for scalars on output 2017-06-22 21:46:04 -07:00
a10a1c92b1 start adding rules to propagate scalar to results 2017-06-22 20:51:02 -07:00
bb6908e163 Scalar objects can now be backed by 0-dim Tensors. 2017-06-22 18:57:09 -07:00
c555cd8253 missing fixed allocator files 2017-06-22 18:32:10 -07:00
5e078bb7cc scalar flags added, and used to dispatch when there is a scalar variant of a function. broadcast annotations are used to figure out when a scalar s + A should also be converted. 2017-06-22 17:22:16 -07:00
ee10e7457f Corrected erroneous docstring for MultiLabelSoftMarginLoss 2017-06-22 17:42:18 -04:00
7cd6cc17af Merge commit '93e05eb458ad4c939e905668c1792692315880b0' 2017-06-22 17:23:02 -04:00
8bfef60b07 Merge commit '32fd4a3d6081a13c18ce4f8dcb37260a830a911f' 2017-06-22 17:22:31 -04:00
a45ad7cfba Advanced Indexing Part 1 -- Purely Integer Array Indexing 2017-06-22 17:21:50 -04:00
93e05eb458 Advanced Indexing Part 1 -- Purely Integer Array Indexing 2017-06-22 17:21:30 -04:00
32fd4a3d60 Advanced Indexing Part 1 -- Purely Integer Array Indexing 2017-06-22 17:21:19 -04:00
f09027bc29 Add batch sampler to DataLoader (#1867) 2017-06-22 20:18:31 +02:00
9a196829e2 Merge commit '43dec0a210103c4421bc73c7e742f0f746b7e39e' 2017-06-22 13:55:54 -04:00
43dec0a210 Remove THCTensor_(expand2) and THCTensor_(expand3).
They are no longer needed and the corresponding TH versions have been removed.
2017-06-22 13:55:08 -04:00
064ef8b81b Merge commit '104234a6a8937f09208061975ce90190a7be4159' 2017-06-22 13:21:59 -04:00
662faf7c41 Merge commit 'a940d4ff8bf5debc76d909a778e2e47d24148ee1' 2017-06-22 13:21:38 -04:00
cph
104234a6a8 add asserts to BCECriterion 2017-06-22 13:20:25 -04:00
cph
a940d4ff8b add asserts to BCECriterion 2017-06-22 13:20:07 -04:00
c16a268f47 Merge commit 'fb32164a72004e63ebfe1f9ca8366ff12f8fbec2' 2017-06-22 12:56:36 -04:00
cb4eaa9c5d TensorLib/Aten --> changes required in pytorch 2017-06-22 12:55:55 -04:00
fb32164a72 TensorLib/Aten --> changes required in pytorch 2017-06-22 12:55:17 -04:00
b5854a11c4 Merge commit 'eccc759c36a4023357c87fde79732e4c916676d2' 2017-06-22 12:49:50 -04:00
ddbd4ef4ac Support out-of-place broadcast type definitions. 2017-06-22 12:49:06 -04:00
eccc759c36 Support out-of-place broadcast type definitions. 2017-06-22 12:48:43 -04:00
fecd05ba2f Merge commit '81e14ad2dee356b2c2274eb302bc2438c9a6161a' 2017-06-22 12:46:37 -04:00
a7d1cd75ec Merge commit '93a7c9de29900f166486373744a0e90c7046a56a' 2017-06-22 12:46:02 -04:00
497db732fc btrifact: Make pivoting optional. 2017-06-22 12:45:14 -04:00
81e14ad2de btrifact: Make pivoting optional. 2017-06-22 12:45:01 -04:00
93a7c9de29 btrifact: Make pivoting optional. 2017-06-22 12:44:51 -04:00
96febbb762 Merge commit '62cfc94f445bfaeaccc3dcc1fc69ea5b75039823' 2017-06-22 12:40:40 -04:00
62cfc94f44 improving TH error messages in Apply macros 2017-06-22 12:38:10 -04:00
3f6cda8696 fix bug of threshold activation 2017-06-22 12:23:35 -04:00
a836f8f56f Use and document saved_variables for double backwards. 2017-06-22 11:46:24 -04:00
278cbbae49 set TH_INDEX_BASE to 0 2017-06-21 16:43:16 -07:00
68cbb857f2 allow tensors to be constructed from views of external data. Support creating new tensors that already have a size/stride 2017-06-21 15:35:08 -07:00
a1c557bc45 improve error reporting for undefined tensors passed as arguments. 2017-06-21 12:24:59 -07:00
4c5b7d41ba tensor.data<> also has toLongData() variants. Scalar now also has .to<T>() variants 2017-06-21 11:57:37 -07:00
13e7648fd1 document accessors 2017-06-21 11:23:03 -07:00
1572173ca7 Implement double backwards for Sort, Topk. 2017-06-21 00:24:13 -04:00
e16ceef76a Implement Scatter double backwards. 2017-06-21 00:24:13 -04:00
b79ff11aca Implement IndexAdd, IndexFill, IndexSelect, MaskedSelect double backwards. 2017-06-21 00:24:13 -04:00
50c0912a75 Implemented masked_fill double backwards. 2017-06-21 00:24:13 -04:00
c3ad55f746 add readme and generated files for Type/Tensor/Functions to a doc folder to make it possible to view headers without building the library 2017-06-20 20:33:26 -07:00
4b93f32234 rename TensorLib -> ATen 2017-06-20 16:49:13 -07:00
03f41c8120 fix capitalization of Python, make it consistent 2017-06-21 00:09:37 +02:00
e0b70d0f64 Fix Fmod/Remainder gradgradcheck by ensuring inputs requires_grad. 2017-06-20 11:59:21 -04:00
0b2b7d0594 Kth value function passes gradgradcheck. 2017-06-20 11:59:21 -04:00
6d97ac0c0f Missing includes in cuda_collective_device.h
Summary: Closes https://github.com/facebookincubator/gloo/pull/47

Differential Revision: D5283752

Pulled By: pietern

fbshipit-source-id: 8ad3353b3455c5416e31e75b46755e2f7fcaad52
2017-06-20 08:54:16 -07:00
a405efa756 CUDA collectives as alternative to NCCL
Summary:
Adds a separate set of CUDA collectives that run on device as an
alternative to NCCL. Use these collectives as default on-device
collectives instead of NCCL.

Whenever multiple processes on the same machine use Gloo with NCCL and
end up doing concurrent CUDA memory allocations and algorithm
execution, we risk deadlock. A follow up change will enable opt-in
usage of NCCL (e.g. through environment variable).

Benchmark output below with varying number of elements. It shows a
minor improvement over using NCCL for local reduction and broadcast.

Number of elements equal to on-device threshold (256K):

```
Device:      tcp, pci=0000:25:00.0, iface=eth0, speed=50000
Algorithm:   cuda_allreduce_ring
Options:     processes=2, inputs=8, gpudirect=no

        elements   min (us)   p50 (us)   p99 (us)   max (us)    samples
(before)  262144       2685       2907       3035       3215        562
(after)   262144       2682       2874       3013       3395        577

Device:      tcp, pci=0000:25:00.0, iface=eth0, speed=50000
Algorithm:   cuda_allreduce_ring_chunked
Options:     processes=2, inputs=8, gpudirect=no

        elements   min (us)   p50 (us)   p99 (us)   max (us)    samples
(before)  262144       2045       2133       2325       2643        725
(after)   262144       1533       1673       1834       2048        800

Device:      tcp, pci=0000:25:00.0, iface=eth0, speed=50000
Algorithm:   cuda_allreduce_halving_doubling
Options:     processes=2, inputs=8, gpudirect=no

        elements   min (us)   p50 (us)   p99 (us)   max (us)    samples
(before)  262144       1580       1640       1718       2069        893
(after)   262144       1371       1446       1539       1748       1125
```

Larger number of elements (4M):

```
Device:      tcp, pci=0000:25:00.0, iface=eth0, speed=50000
Algorithm:   cuda_allreduce_ring
Options:     processes=2, inputs=8, gpudirect=no

        elements   min (us)   p50 (us)   p99 (us)   max (us)    samples
(before) 4194304      55543      58058      60103      62659         32
(after)  4194304      54490      57923      60893      66058         33

Device:      tcp, pci=0000:25:00.0, iface=eth0, speed=50000
Algorithm:   cuda_allreduce_ring_chunked
Options:     processes=2, inputs=8, gpudirect=no

        elements   min (us)   p50 (us)   p99 (us)   max (us)    samples
(before) 4194304      18049      22820      24997      26634        105
(after)  4194304      18356      20463      21695      22589         99

Device:      tcp, pci=0000:25:00.0, iface=eth0, speed=50000
Algorithm:   cuda_allreduce_halving_doubling
Options:     processes=2, inputs=8, gpudirect=no

        elements   min (us)   p50 (us)   p99 (us)   max (us)    samples
(before) 4194304      18584      24345      27809      29722         95
(after)  4194304      19541      22718      25408      26688         88
```

Reviewed By: akyrola

Differential Revision: D5278192

fbshipit-source-id: 53f09e404663ddc8bb46d06ac87afd8ee3ffc3a2
2017-06-20 00:23:43 -07:00
67968cb60b Add numerically stable BCELoss which takes logits as input (#1792) 2017-06-19 22:05:51 -04:00
a6c5e3f2e2 Fix case where interface doesn't have an address
Summary:
Code in tcp/transport tries to find the network interface a socket was
bound to when creating a TCP device context. Per getifaddrs(3), it is
possible for the ifa_addr field to be NULL (supposedly when an
interface doesn't have an address). Ignore such entries.

Thanks to slayton58 for reporting this.

Reviewed By: wesolwsk

Differential Revision: D5279376

fbshipit-source-id: 039380b95ba4d6d94942c30581e0b230a060870c
2017-06-19 18:05:32 -07:00
6ee6b4980b multiple docs 2017-06-19 20:06:27 -04:00
ceb13c8cc3 Don't propagate -mavx flag to dependents
Summary:
Previously, `gloo/math.h` inlined methods which use AVX builtins,
which required propagating the `-mavx` flag.
This diff moves these definitions out of the header and into a source
file to avoid this.

Reviewed By: pixelb

Differential Revision: D5271043

fbshipit-source-id: dde4dc560dfb557b46d1a582a8b38e7cb8eb0c37
2017-06-19 16:46:43 -07:00
82ef292f00 Add gradgradchecks for various autograd Functions and support Unfold double backwards. 2017-06-19 18:19:16 -04:00
76ee014d10 Add documentation to SELU and AlphaDropout 2017-06-19 18:18:01 -04:00
f619ac6ac9 Quickfix for AlphaDropout on CUDA 2017-06-19 18:18:01 -04:00
32e6372538 Split cuda_collectives.h into two files
Summary:
This change prepares for having a separate set of collectives that
use native CUDA calls instead of NCCL. This is needed to workaround
the issue where NCCL deadlocks when it is interleaved with CUDA memory
management operations in other processes on the same machine.

Includes a modification to the host reduction functions to bring them
up to parity with the NCCL reduction functions (they now incorporate
offset/counter arguments).

Reviewed By: wesolwsk

Differential Revision: D5276291

fbshipit-source-id: 8844731760d2c48577d207c026ce0cd641f2fc6d
2017-06-19 12:57:53 -07:00
172a356668 forgotten import in variables.py
Fixing error on line 661: 
warnings.warn("masked_copy_ is deprecated and renamed to masked_scatter_, and will be removed in v0.3")
NameError: name 'warnings' is not defined
2017-06-19 14:23:48 +02:00
329a2f7d27 Prevent divide by zero in dropout with p=1 2017-06-17 11:38:02 -04:00
69e38ee821 clean test code, no functional change 2017-06-17 11:11:48 -04:00
38e6b9c7e7 fix bug in wrap_outputs miscounting the number of inputs 2017-06-17 11:11:48 -04:00
7775e9e777 add newNarrow to thpp THCTensor 2017-06-17 11:11:48 -04:00
293262b8f1 fix cuda tests 2017-06-17 11:11:48 -04:00
e66e01a2a0 remove extra computations for input usage check 2017-06-17 11:11:48 -04:00
0a93903e8e move tests to test_nn 2017-06-17 11:11:48 -04:00
bcac55dd2f force 1 stride for 1-sized dim for cudnn, fix lint, remove extra unpacking 2017-06-17 11:11:48 -04:00
6cdcd9c603 Add Narrow function
clean error message and support non perfectly sized inputs
2017-06-17 11:11:48 -04:00
075030d974 add cuda tests that use only cunn for finite difference computations 2017-06-17 11:11:48 -04:00
23dec70614 comment on working values for epsilon 2017-06-17 11:11:48 -04:00
fc0ab229ad remove extra cloning and add contiguous calls 2017-06-17 11:11:48 -04:00
ce3bc5a4a5 force cloning of weights 2017-06-17 11:11:48 -04:00
3dbece7eb5 clean tests 2017-06-17 11:11:48 -04:00
bd94718c87 cleaner AccumulateGrad 2017-06-17 11:11:48 -04:00
2f8d21a7f2 add contiguous function 2017-06-17 11:11:48 -04:00
4f4fc9091a add support for newTranspose in thpp::THCTensor 2017-06-17 11:11:48 -04:00
7ee095cf7f add newExpand and newView to thpp::Tensor 2017-06-17 11:11:48 -04:00
462ab8a644 add Transpose View Expand C functions 2017-06-17 11:11:48 -04:00
dd5c7c473f Add ConvBackwardBackward class 2017-06-17 11:11:48 -04:00
6dca309017 make AccumulateGrad support no input gradient 2017-06-17 11:11:48 -04:00
f945fbc3dd add gradgradcheck and conv double backward tests 2017-06-17 11:11:48 -04:00
db70d4d223 1) Simplify CompareOp autograd backward
2) Use better approach for avoiding divide-by-0 in autograd tests.
2017-06-17 09:38:28 -04:00
7714b5a088 Fix autograd shape tracking for 1-d reduction ops. 2017-06-17 09:38:28 -04:00
860f51e67f Avoid nans in fmod/remainder tensor tests.
Also clean up CompareOp autograd backwards impl.
2017-06-17 09:38:28 -04:00
2c04ce63a5 Fix masked_scatter autograd broadcasting. 2017-06-17 09:38:28 -04:00
83bfa5e1ab Fix masked_scatter pointwise autograd backward behavior. 2017-06-17 09:38:28 -04:00
618f20fb38 Fix autograd broadcasting for masked_fill. 2017-06-17 09:38:28 -04:00
9711223c12 Add broadcast autograd tests for dist. 2017-06-17 09:38:28 -04:00
7d0f1c51bb Fix autograd broadcast for min, max. 2017-06-17 09:38:28 -04:00
7560474fbb Fix autograd pointwise fallback for max,min. 2017-06-17 09:38:28 -04:00
e69fe5bdb0 Automatically detect when to skip inplace tests and fix lint. 2017-06-17 09:38:28 -04:00
f3ae90e329 Fix broadcast and pointwise compare ops with autograd. 2017-06-17 09:38:28 -04:00
bfdd1f2199 Fix fmod/remainder autograd broadcasting. 2017-06-17 09:38:28 -04:00
b164efb8b0 Fix lerp broadcast autograd. 2017-06-17 09:38:28 -04:00
94c7260087 Fix pointwise fallback for lerp. 2017-06-17 09:38:28 -04:00
aac459431b Fix pow autograd broadcast. 2017-06-17 09:38:28 -04:00
a04d1af0a4 Fix addr, addmm, baddmm, addmvm, addbmm broadcasting with autograd.
Fix autograd broadcast for addmm, baddmm, others.
2017-06-17 09:38:28 -04:00
a54a7c1312 Fix addcmul, addcdiv autograd broadcasting. 2017-06-17 09:38:28 -04:00
9ba799c26b Fix pointwise fallback for addcdiv, addcmul. 2017-06-17 09:38:28 -04:00
5cfb1329b5 Make implementation of Variable.mul_ and Variable.div_ consistent. 2017-06-17 09:38:28 -04:00
af2dd0d3e9 Fix autograd for broadcasting with add, sub, mul, div. 2017-06-17 09:38:28 -04:00
79a343bbd4 Remove unnecessary squeezing in Expand backwards.
Also add size checks to test_autograd to try to catch such issues.
2017-06-17 09:38:28 -04:00
88e4bec8fa resize bug fix 2017-06-17 11:07:22 +02:00
faa7c2cc2c fix cuda breakage 2017-06-16 20:13:46 -04:00
3cecdf84f1 Storage from_file method (#1821) 2017-06-17 00:34:20 +02:00
49586d9556 Add basic API support for NCCL 2.0
Summary:
\cc pietern
Minimal changes to allow gloo to compile and run with NCCL 2.0
Closes https://github.com/facebookincubator/gloo/pull/46

Differential Revision: D5268074

Pulled By: pietern

fbshipit-source-id: 58d625d57b31cfc932f3dbbdd7a4b83d9a2e60a8
2017-06-16 15:22:14 -07:00
8d33603901 make t() of Variable consistent with Tensor (#1823) 2017-06-16 16:08:53 +02:00
a64560c22e Remove flattening for torch.dot (#1781) 2017-06-16 02:15:33 +02:00
97f50edf46 Add documentation for Cholesky lapack functions (#1816) 2017-06-16 02:10:56 +02:00
86a96cd759 Merge commit 'd605afe8b51bf1522d3caf4efef4b3c85def499b' 2017-06-15 12:33:45 -04:00
f61ec2495e nn.EmbeddingBag to compute a bag of word embeddings (Embedding + Sum/Mean) 2017-06-15 12:32:47 -04:00
d605afe8b5 nn.EmbeddingBag to compute a bag of word embeddings (Embedding + Sum/Mean) 2017-06-15 12:32:28 -04:00
909f31764f Add nn.padding to docs fixes #1127 (#1808)
* exposed nn.padding modules

* using functional
2017-06-15 07:41:38 -04:00
ea5819045e a few comments in build_all.sh (#1807) 2017-06-14 17:58:56 -04:00
9c53c6dcb9 Fix errors and warnings when building docs (#1806) 2017-06-14 13:50:14 -04:00
9d916e561c batch norm docfix (#1804)
fixes the formula for batch normalization (moves the epsilon inside
the square root)
2017-06-14 11:57:46 -04:00
4e356528b4 Add torch.matmul function. (#1780)
* Add torch.matmul function.

Includes test_torch, test_autograd and docs changes.

* Add __all__ to functional so imports aren't accidentally imported.

* Include unbind in __all__.

* Add matmul case for when one argument is 1-dimensional and the other
at least 3-dimensional.

* Add squeeze_ to Variable.

* Use squeeze_ instead of squeeze for matmul.
2017-06-14 08:14:53 -04:00
9fd354e643 More accurate build instructions based on @apaszke's comments. (#1800) 2017-06-14 12:04:45 +02:00
c8e9bc493b Merge commit '244af06adc77674e7e1134d67d4a56ae7641f7b9' 2017-06-13 20:49:37 -04:00
6de5ce6bac Merge commit '1cf105d517c4308912eee85eff8f50f31c9e31f1' 2017-06-13 20:49:13 -04:00
38b9598685 Added GLU (gated linear unit)
From https://arxiv.org/abs/1612.08083
2017-06-13 20:48:19 -04:00
244af06adc Added GLU (gated linear unit)
From https://arxiv.org/abs/1612.08083
2017-06-13 20:48:03 -04:00
1cf105d517 Added GLU (gated linear unit)
From https://arxiv.org/abs/1612.08083
2017-06-13 20:47:55 -04:00
3ada9da808 Make csrc -Werror clean. (#1795)
Primary things I had to fix:

- Suppress _XOPEN_SOURCE warnings by ensuring that Python.h is included
  first, because it always unconditionally defines this macro.

- Turn off strict aliasing, because Python 2 doesn't work with strict
  aliasing.

- Workaround setuptools bug, where it's incorrectly passing
  -Wstrict-prototypes to C++ compilers (where this doesn't make
  any sense)

To compile csrc with -Werror, run `CFLAGS="-Werror" python setup.py build_ext`

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 20:18:09 -04:00
5a63a6d47f Better document how to rebuild only parts of the project. (#1796)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 17:23:39 -04:00
38a48729f0 Merge commit '1a6995b28ca42df41270d4fd914adfb9c8c59674' 2017-06-13 16:31:48 -04:00
deb0aef30c Merge commit '122dd9e8ec4627ccdd895a7dc88a1ec6f13ad6d2' 2017-06-13 16:31:13 -04:00
3977ee3520 Support device on sparse tensor constructor, assert values/indices on same device.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:35 -04:00
c0e7bda3f1 Enforce storage is not NULL invariant for sparse tensors.
Fixes #1783.

There is an undocumented invariant in PyTorch that we should
try to avoid having storage == NULL as much as possible (even
though Torch supports it.)  This commit properly documents the
invariant, and fixes a bug in sparse where the invariant was
not respected.  This now means that sparse tensors now correctly
remember what GPU they are associated with.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:35 -04:00
df412051fd Add comment stating nDenseTensors != nTensors in checkGPU.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:35 -04:00
7bee03fe1e Do NOT clone indices/values passed to sparse tensor by default.
Fixes #1782.

The default operation should be cheap: user can always choose to
explicitly make a copy on the way in.  Note that this is a
BACKWARDS COMPATIBILITY BREAKING change.  However, we DO create
a new tensor wrapper (so we are not affected by subsequent
size changes, etc.)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:34 -04:00
865beada0e Add comment about new implementation being CPU-only.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:34 -04:00
6a46863c83 Abort on known bug (#1521) for spcadd on non-coalesced.
It's better to error than to silently give wrong results.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:19 -04:00
d763db59a9 More efficient nnz test in spcadd.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:19 -04:00
5d6e593c67 Test clone preserves uncoalescedness if it wasn't coalesced.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:19 -04:00
bac408b693 Add some docs about storage->Size.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:19 -04:00
2f967a204c Sparse tensor clone() preserves coalescedness.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:30:19 -04:00
1a6995b28c Short-circuit copy if src and dest are equal.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:20:04 -04:00
122dd9e8ec Short-circuit copy if src and dest are equal.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-13 16:19:35 -04:00
7c024e93c6 Implement Cumprod function for autograd (#1439) 2017-06-13 17:48:15 +02:00
b4698d6d1d add init to __init__.py of torch.nn (#1789) 2017-06-13 09:02:30 -04:00
d9d50f80c7 Rename arguments to distributed collectives 2017-06-12 22:02:11 -04:00
714351ff39 Officially enable process-group mode 2017-06-12 22:02:11 -04:00
6f51b4ce2d Fix deadlock in GlooCache 2017-06-12 22:00:22 -04:00
12813b88f6 Add DistributedDataParallel 2017-06-12 22:00:22 -04:00
23ab9d481a Add Module._all_buffers 2017-06-12 21:58:38 -04:00
8db8716c7c Support non-default streams in NCCL reduce 2017-06-12 21:58:38 -04:00
b37f18be53 Free GIL when entering THD functions 2017-06-12 21:58:38 -04:00
5a0d5ec058 Add more checks in torch.distributed 2017-06-12 21:58:38 -04:00
095ddc7d08 THD updates and bug fixes
* Add keepdim
* Fix DataChannel signature
* Fix incorrect locking
* Use current stream in DataChannelGloo
2017-06-12 21:58:38 -04:00
86a065e45b Add end callbacks to the engine 2017-06-12 21:58:38 -04:00
59d438de2e change function to remove dependence on CUDA 8.0
Summary: Replace call to function that is only supported in CUDA 8.0 with one that has been supported in previous releases.

Reviewed By: pietern

Differential Revision: D5231755

fbshipit-source-id: d72aec2a4a1c511064a65142887f8a05b51dad55
2017-06-12 15:53:59 -07:00
6626881e7a Add Alpha Dropout (#1775) 2017-06-13 00:39:49 +02:00
49ec984c40 Ensure warnings are repeated in python2 for tests. 2017-06-11 05:37:59 -04:00
afaad94fed Rename autograd keepdim tests that now default to True. 2017-06-11 05:37:59 -04:00
4f602a52b5 Use THPUtils_assert rather than THError in torch/csrc/Module. 2017-06-11 05:37:59 -04:00
3abc8be42c Clarify use of warn vs raise in expand_utils and don't catch exception in Broadcast plugin when fallback = false. 2017-06-11 05:37:59 -04:00
f4ce99fd87 Add dist, atan2, lerp to fallback functions.
They weren't documented as having those semantics, but tests on
master show they do.
2017-06-11 05:37:59 -04:00
d5a0f97ea7 Renamed masked_copy to masked_scatter in test, fix use of break/continue. 2017-06-11 05:37:59 -04:00
e8ec4110f6 Fix Prod backward for broadcasting. 2017-06-11 05:37:59 -04:00
ffd808768e Remove raiseErrors from THTensor functions, have THStorage functions take an error_buffer to return a proper error message while being able to handle memory management correctly from calling function. 2017-06-11 05:37:59 -04:00
5b81746767 Simplify python warning settings and cleanup tests. 2017-06-11 05:37:59 -04:00
d49b73bbe6 Rename check_fallback to check_backincompat_expand_warn for clarity. 2017-06-11 05:37:59 -04:00
7040b82ede Change async/broadcast copy arguments to be parsed as ints. 2017-06-11 05:37:59 -04:00
723819014e Move expand_utils-inl.h to generic/ and generate via macros. 2017-06-11 05:37:59 -04:00
1ef4cc1591 Incorporate review comments:
1) Line up trailing dimensions in broadcast docs.
2) remove unnecessary expand_as in common_nn test.
3) use view in tensor_str instead of resize_.
4) newExpand remove raiseErrors change.
5) clarify expandedSizes/expandedStrides parameters in inferExpandGeometry.
6) simplify inferSize2/inferSizeN implementations.
7) use new-style classes for warning.
2017-06-11 05:37:59 -04:00
deec86cc05 Clarify a number of comments. 2017-06-11 05:37:59 -04:00
7da46097fe Fix lint errors. 2017-06-11 05:37:59 -04:00
21d9b0c9dd Ensure warnings are repeated in test, necessary in python2. 2017-06-11 05:37:59 -04:00
69287250d1 Add a broadcast parameter to copy_, use it in the library in cases where there is non-broadcasting calls exposed by the tests. 2017-06-11 05:37:59 -04:00
74a23c5aba Fix test_broadcast for cuda tensors, since map_, map2_ not implemented. 2017-06-11 05:37:59 -04:00
177785eecf explicit Ptr constructors, fast transposed copy. 2017-06-11 05:37:59 -04:00
ad9604f45a Add documentation for copy_. 2017-06-11 05:37:59 -04:00
65b23f146e Add broadcasting support for copy_, simplify code generation by moving a lot of currently generated code to expand_utils. 2017-06-11 05:37:59 -04:00
c54e532954 Add broadcasting support for map_, map2_. 2017-06-11 05:37:59 -04:00
ec120fac0c Add broadcasting support for masked_copy, masked_fill. 2017-06-11 05:37:59 -04:00
e06523482a Use THSize_isSameSizeAs, instead of THTensor_(isSameSizeAs) in order to compare sizes of tensors with different data types. 2017-06-11 05:37:59 -04:00
d6fb92fec9 Improve in-place broadcasting back compat warning message and fix an issue where the deprecated warning would not be printed. 2017-06-11 05:37:59 -04:00
5e1a714386 Add backwards incompatibility docs. 2017-06-11 05:37:59 -04:00
be65f46c76 Add optional warning for backwards incompatible keepdim. Setting torch.utils.backcompat.keepdim.warning.enabled=True will cause Python warnings in the case where the default value of keepdim is used for 1-d reductions.
Also specify keepdim via kwargs in library so these warnings have less
noise.
2017-06-11 05:37:59 -04:00
3556d1b8a3 Add optional warning for backwards incompatible broadcast.
Setting torch.utils.backcompat.broadcast.warning.enabled=True
will cause Python warnings in the case where broadcast occurs
but previously 1-d view style pointwise ops occurred.
2017-06-11 05:37:59 -04:00
5af46cb352 Add broadcasting support for matmul. 2017-06-11 05:37:59 -04:00
a36f95fe26 Add broadcast support for fused-matmul broadcasting. Functions are: addmm, addbmm, addr, addmv, baddbmm. 2017-06-11 05:37:59 -04:00
cd35091d9b Include simple broadcasting example and demonstrate lining up trailing dimensions. 2017-06-11 05:37:59 -04:00
3c586d196a Document Broadcast Plugin. 2017-06-11 05:37:59 -04:00
8e2f347951 Proof that broadcasting 3 args (expand3) is equivalent to breaking up operation. 2017-06-11 05:37:59 -04:00
d279c6e099 Docs for addcdiv, addcmul 2017-06-11 05:37:59 -04:00
014372e707 Support "fused" ops: addcmul/addcdiv. 2017-06-11 05:37:59 -04:00
92fde6cf06 Breakup in place broadcast to better handle multiple arguments. 2017-06-11 05:37:59 -04:00
b44ea57ba8 Change order of Broadcast specification.
Since fused ops require broadcasting self over multiple other arguments,
it is simpler to specify broadcast on self rather than the other
way around.
2017-06-11 05:37:59 -04:00
e96f854ce2 Implement/test broadcasting semantics for comparison ops. 2017-06-11 05:37:59 -04:00
edf2969bd8 Backwards compatible Spatial Normalizations / CrossMapLRN. 2017-06-11 05:37:59 -04:00
e653fe2857 Test fixes for keepdim=False, suppress warnings on backwards-compatible behavior. 2017-06-11 05:37:59 -04:00
70c33777a6 pow, fmod, remainder also should fallback.
This behavior isn't listed in the docs, but the tests depend on it.
2017-06-11 05:37:59 -04:00
471dfe9791 Add documentation including links to numpy broadcasting semantics. 2017-06-11 05:37:59 -04:00
85d838a028 Testing over the following: 1) CPU tensor out-of-place functions 2) CPU tensor in-place functions 3) GPU tensor out-of-place functions 4) GPU tensor in-place functions 5) torch. functions 6) Fallback semantics (use pointwise nElem matching rather than broadcasting) 2017-06-11 05:37:59 -04:00
6a40acb4f0 Add Broadcast plugin. 2017-06-11 05:37:59 -04:00
9087624634 Revert "Restore examples with keepdim=True default."
This reverts commit 6fab62173e842bbf550de1c68cfae507ca35b800.
2017-06-11 05:37:58 -04:00
e772a440cb Revert "Change keepdim default to False."
This reverts commit e124790cb2b6675a4b6edf64620a7eb7f7228b29.

Note the original commit message is incorrect; this changes keepdim
back to false.
2017-06-11 05:37:58 -04:00
efd8b54be2 Merge commit 'e45c1046feba46aef2ffac1b1d978a3e76936bab' 2017-06-11 05:37:51 -04:00
54c3441e9c Merge commit '7d1b042cb2198d2bdb5871b08c6c0fb2ccc8e6b1' 2017-06-11 05:37:18 -04:00
7d1b042cb2 fix type 2017-06-11 04:42:34 -04:00
e45c1046fe Remove raiseErrors from THTensor functions, have THStorage functions take an error_buffer to return a proper error message while being able to handle memory management correctly from calling function. 2017-06-11 04:33:54 -04:00
a563ce1105 Incorporate review comments:
1) Line up trailing dimensions in broadcast docs.
2) remove unnecessary expand_as in common_nn test.
3) use view in tensor_str instead of resize_.
4) newExpand remove raiseErrors change.
5) clarify expandedSizes/expandedStrides parameters in inferExpandGeometry.
6) simplify inferSize2/inferSizeN implementations.
7) use new-style classes for warning.
2017-06-11 04:33:54 -04:00
92d52bf395 Add broadcasting support for copy_, simplify code generation by moving a lot of currently generated code to expand_utils. 2017-06-11 04:33:54 -04:00
0463ddf16b Support "fused" ops: addcmul/addcdiv. 2017-06-11 04:33:54 -04:00
9060e6be7f Remove raiseErrors from THTensor functions, have THStorage functions take an error_buffer to return a proper error message while being able to handle memory management correctly from calling function. 2017-06-11 04:32:08 -04:00
f0b8c4821b Incorporate review comments:
1) Line up trailing dimensions in broadcast docs.
2) remove unnecessary expand_as in common_nn test.
3) use view in tensor_str instead of resize_.
4) newExpand remove raiseErrors change.
5) clarify expandedSizes/expandedStrides parameters in inferExpandGeometry.
6) simplify inferSize2/inferSizeN implementations.
7) use new-style classes for warning.
2017-06-11 04:32:08 -04:00
0f79bf1a69 Clarify a number of comments. 2017-06-11 04:32:08 -04:00
503002eda7 Add broadcasting support for copy_, simplify code generation by moving a lot of currently generated code to expand_utils. 2017-06-11 04:32:08 -04:00
cf55e1e48a Add broadcasting support for masked_copy, masked_fill. 2017-06-11 04:32:08 -04:00
8d35d4215b Use THSize_isSameSizeAs, instead of THTensor_(isSameSizeAs) in order to compare sizes of tensors with different data types. 2017-06-11 04:32:08 -04:00
9356640453 Properly clean up expand error cases. 2017-06-11 04:32:08 -04:00
ae6b8d0112 Include simple broadcasting example and demonstrate lining up trailing dimensions. 2017-06-11 04:32:08 -04:00
ec2f6a81fd Support "fused" ops: addcmul/addcdiv. 2017-06-11 04:32:08 -04:00
1f9a365fdc Add Infer Size N, for expansion of fused operations. 2017-06-11 04:32:08 -04:00
d38a87217f Expand improvements
1) Rename calculateExpandGeometry to inferExpandGeometry for consistency
2) Simplify inferExpandGeometry implementation by using a single pass
   through dimensions
3) Implement a two operand expansion, expand2.
4) Implement versions that return error code to use for fallback to
equal nElem support.
2017-06-11 04:20:04 -04:00
baa4ba973b Expand improvements
1) Rename calculateExpandGeometry to inferExpandGeometry for consistency
2) Simplify inferExpandGeometry implementation by using a single pass
   through dimensions
3) Implement a two operand expansion, expand2.
4) Implement versions that return error code to use for fallback to
equal nElem support.
2017-06-11 04:19:37 -04:00
a24db91a38 Add SELU activation function (#1769)
* Add SELU activation function

* Remove unnecessary case

* Add Function for SELU + tests and fix RReLU inplace

* Fix extra line in doc

* Fix tests

Remove in-place tests for RReLU. For some reason they fail on legacy nn, but pass on nn

* SELU in new-style Function

It also supports double backprop, verified with gradgradcheck

* Fix flake8
2017-06-11 10:07:48 +03:00
e3d5826b92 Add Cumsum double backwards support. (#1758) 2017-06-10 18:27:44 +02:00
ba690d5607 Add support for NVTX functions. (#1748) 2017-06-10 18:26:58 +02:00
5f1a16a018 Torch manual seed to seed cuda devices (#1762) 2017-06-10 12:37:21 +02:00
dcf07a2d7f Fix typo in ParameterList documentation 2017-06-10 02:16:52 +02:00
fab5bef9f6 Merge pull request #45 from slayton58/nccl_cmake_fix
Fix NCCL directory typo
2017-06-08 11:28:25 -07:00
21a5c8ea5e Fix use of nccl_INCLUDE_DIRS in nccl.cmake 2017-06-07 20:13:11 -04:00
5300aafc1f Fix NCCL directory typo 2017-06-07 17:01:13 -04:00
a9bd1de9e9 fixed README to reflect docker image name (#1751) 2017-06-07 15:49:39 -04:00
e57eef4bcb Merge commit '62835fc3f5346968b4dca392c77efdeb75a6b172' 2017-06-07 14:54:47 -04:00
d81da41650 Make sure the number of MKL and OpenMP threads match
Otherwise, on many machines, the size of the OpenMP thread pool will
change between MKL and our OpenMP enabled functions. The constant thread
creation and destruction results in worse performance and leaks memory
on GCC 5.4
2017-06-07 14:53:29 -04:00
62835fc3f5 Make sure the number of MKL and OpenMP threads match
Otherwise, on many machines, the size of the OpenMP thread pool will
change between MKL and our OpenMP enabled functions. The constant thread
creation and destruction results in worse performance and leaks memory
on GCC 5.4
2017-06-07 14:53:14 -04:00
da7957c660 Fix masked_copy call to masked_scatter. (#1749) 2017-06-07 12:58:47 -04:00
2a49353d5e minor fix for docs of Upsample 2017-06-07 11:42:52 -04:00
b05c23de44 Merge commit 'da45b4c6b3b0b7cd8f0dc612b9afa6a3a07b8305' 2017-06-07 11:31:38 -04:00
019e967113 Merge commit '47bf87b9220c10edaafec98c6bd20bdb1436c8e4' 2017-06-07 11:30:35 -04:00
b9ab26765e Add 3D upsampling (nearest and trilinear) with tests 2017-06-07 11:29:27 -04:00
da45b4c6b3 Add 3D upsampling (nearest and trilinear) with tests 2017-06-07 11:24:41 -04:00
47bf87b922 Add 3D upsampling (nearest and trilinear) with tests 2017-06-07 11:24:05 -04:00
edd41d8d80 BatchNorm fallback to THNN when eps < CUDNN_BN_MIN_EPSILON (#1742) 2017-06-07 09:56:28 -04:00
352f8b2fa6 Merge commit 'ced01f6c919c4b7109512ce797a2a0185c8f8112' 2017-06-07 09:22:14 -04:00
ced01f6c91 fix GRUFused signature 2017-06-07 09:21:20 -04:00
d351239c10 fix legacy ClassNLLCriterion for upstream change 2017-06-07 00:38:00 -04:00
1b1579c89d Merge commit 'b96f76e470b25454b6b14c7ace888686295405e9' 2017-06-07 00:19:42 -04:00
df7c47142d fix for THNN NLLLoss signature change 2017-06-07 00:18:11 -04:00
b96f76e470 standalone macros 2017-06-07 00:17:05 -04:00
7e62971c86 Merge commit '71ccedbc6c4e460d38c794737bba780e7673e888' 2017-06-06 23:38:52 -04:00
a7d987544d Merge commit '4e49aed5eaa5a4abaf0a51bb87a49b44394ea3c3' 2017-06-06 23:35:42 -04:00
4e49aed5ea fix outputHeight <-> outputWidth 2017-06-06 23:33:51 -04:00
71ccedbc6c Merge pull request #470 from qqning/master
Fix the mix-up of height and width on depth-wise convolution
2017-06-06 23:31:54 -04:00
c3cda260b6 Merge commit '64faf120acb97866dfd90bf428b385deee4ee912' 2017-06-06 23:27:45 -04:00
22949350b6 More performant fix for fused rnn kernels (#1532) and bugfix (#1721) 2017-06-06 23:25:31 -04:00
3f7b48ccda Remove clone in fused rnn 2017-06-06 23:20:14 -04:00
db620304b2 More performant fix for fused rnn kernels (#1532) and bugfix for #1721 2017-06-06 23:13:07 -04:00
d7db75c10f added CosineSimilarity to nn.distance and updated docs (#1672)
* added CosineSimilarity to nn.distance and updated docs
2017-06-06 22:53:21 -04:00
e50d599240 Fix header inclusion in math.h
Summary:
While debugging #43 I found common/common.h missing some headers as well.

Fixes #43.
Closes https://github.com/facebookincubator/gloo/pull/44

Differential Revision: D5194970

Pulled By: pietern

fbshipit-source-id: 4861cd04c56931d4759f5bc050816788252003ee
2017-06-06 15:21:08 -07:00
c6a6391c38 added checks to cudnn Convolution for stride, dilation, kernel size and num input planes (#1723)
* added checks to cudnn Convolution for stride, dilation, kernel size and num input planes
2017-06-06 15:42:00 -04:00
d50ad408fa fix incorrect grad_weight in Bilinear 2017-06-06 15:07:09 -04:00
73ccdb3920 Fixing the issue with incorrect normalized values in IndexLinear 2017-06-06 11:44:11 -07:00
b6c75c43c8 add tests for checking the type of .data and .grad.data is the same 2017-06-06 01:06:14 -04:00
a53cde09b5 Rename masked_copy_ to masked_scatter_ 2017-06-06 01:06:14 -04:00
98afdcf409 Accept None values returned from grad hooks 2017-06-06 01:06:14 -04:00
ef32e96447 Fix grad type of compare functions 2017-06-06 01:06:14 -04:00
b032b88f34 Fix Prod backward and autograd tests 2017-06-06 01:06:14 -04:00
a76098ac15 fix optimizer when given single parameters (instead of an iterable)
When I use named_parameters to modify the lr and weight decay, I hit a bug, because each value that named_parameters returns is a torch.nn.parameter.Parameter, not a generator of Parameters.
2017-06-05 23:47:56 -04:00
2ce5875a4d Modify the sample code of extending autograd (#1720)
The original input can not be used as input of Linear(), because forward() takes at least 3 arguments (2 given)
2017-06-05 23:36:58 -04:00
511cb20e7d Add Gesv to autograd (#1733)
* Add Gesv to autograd

* Add TODO for backprop through LU
2017-06-05 21:38:49 -04:00
e3305eb9dc Runtime dockerfile (#1732)
* reduce the size of Docker image

* add runtime dockerfile
2017-06-05 17:40:06 -04:00
e9bf702c5e LSTM bias_hh, fix docs
Rename W_hi ... to b_hi ...
2017-06-05 22:55:09 +02:00
9a2d11dd36 Use a longer timeout when establishing initial tcp connection
Summary: Machines may not create their Gloo pairs at the same time, due to earlier variable time work. Increase the timeout used to establish the initial tcp connection to accommodate without sacrificing the shorter default timeout for outstanding reads/writes. No related change required for ibverbs as there is no communication on init.

Reviewed By: akyrola

Differential Revision: D5184518

fbshipit-source-id: 0e6c9704a2d2f1406b3927f75887f0a42199450b
2017-06-05 13:40:22 -07:00
3716286e6b reduce the size of Docker image (#1729) 2017-06-05 14:03:11 -04:00
c357ebd590 Merge commit '6422ea3d9f065683bb899b88ae0baec79e6d73ca' 2017-06-05 13:01:25 -04:00
85a95d8a23 Fix sharing of CUDA tensors on non-current devices
The correct device must be set when getting the base allocation and when
calling cudaIpcCloseMemHandle. Store the device in the allocators
context, which was previously always NULL.

Fixes #1707
2017-06-05 13:01:19 -04:00
6422ea3d9f Fix sharing of CUDA tensors on non-current devices 2017-06-05 12:58:34 -04:00
ddf6328990 Document type function returns type with no args (#1719) 2017-06-05 11:54:55 -04:00
174c3cc399 Add support for double backward of LeakyReLU (#1714) 2017-06-05 11:53:27 -04:00
24aecaa2c8 Cleanup torch vision docs (#1699)
* Modify torchvision documentation following https://github.com/pytorch/vision/pull/179

* Add new datasets to docs

* Fix wording in torch.datasets

* Small clarification
2017-06-05 11:52:41 -04:00
4853cc0194 convert linalg.py to new-style functions (#1638) 2017-06-04 09:27:01 -04:00
ac1c674723 Fix a couple of selection reduce function autograd bugs (#1702)
* Fix Median/Mode autograd functions.

* Fix kthvalue autograd function.

* Double backward for selection reduce functions.
2017-06-03 02:12:15 -04:00
eba3dc8561 Fix gc_refs assertion failure (#1705)
* Fix gc_refs assertion failure

Ensure that each THPVariable -> THPFunction reference contributes one
ref count to the THPFunction by creating a new shared_ptr for each ref.

Because multiple shared_ptrs can again manage a single THPFunction, it's
not safe to use std::weak_ptr where it may point to a PyFunction. It's
still safe to use weak_ptr for grad_accumulator since these are never
PyFunctions.

Fixes #1626

* Remove stale comment
2017-06-02 21:08:50 -04:00
ee9d4d58e2 Fix connect bug
Before the change, processes were not waiting for master even when they got
'connection refused' (master is not listening yet, so we should wait).
It was because we were closing socket twice: first, by
the resource guard; second, manually in exception handler.
That caused errno to be set to different value (9 - bad file descriptor)
and in result `if`, which checked if connection was refused, was failing.
2017-06-02 23:42:11 +02:00
b7c4900d19 Fix minor bug in InitMethodFile 2017-06-02 23:42:11 +02:00
e22f9036de Add tcp init method for non-multicast addresses 2017-06-02 23:42:11 +02:00
c01ff1f3dc Make world_size mandatory for Master and Worker; Minor refactor 2017-06-02 23:42:11 +02:00
eeb8e5c31b Linux fixes 2017-06-02 23:42:11 +02:00
c6c9e61169 Implement THD tensor copies 2017-06-02 23:42:11 +02:00
34804e9600 Refactor file and tcp init methods
* Add sanity checks
 * Refactor InitMethodFile and TCPInitMethod to more logical functions
 * Update few error messages
 * Add passing parameters by **kwargs, so now order of parameters is not relevant
 * Review comments
2017-06-02 23:42:11 +02:00
c41555fb0a Add rank parameter; Fix MW mode initialization 2017-06-02 23:42:11 +02:00
96cc1e1ac7 Review comments 2017-06-02 23:42:11 +02:00
cfdd49f76a Simplify and refactor init code 2017-06-02 23:42:11 +02:00
447d9287bf Refactor multicast and change env init method 2017-06-02 23:42:11 +02:00
832eaf900b Fix bugs and improve init methods 2017-06-02 23:42:11 +02:00
e685277299 Add address discovery; Bug fixes; 2017-06-02 23:42:11 +02:00
8ea7c87c29 Improve init methods 2017-06-02 23:42:11 +02:00
09c0d9c51c Add multiple initialization methods for DataChannels 2017-06-02 23:42:11 +02:00
240384605c Make copy functions thread safe (#82) 2017-06-02 23:42:11 +02:00
9f9a3d596f Use lock_guard and don't use unique_ptr 2017-06-02 23:42:11 +02:00
a8c26c1040 Add mutexes to MasterCommandChannel::sendMessage 2017-06-02 23:42:11 +02:00
6cdfe0d7b9 Remove MASTER_ADDR and _PORT from MPI benchmarking 2017-06-02 23:42:11 +02:00
1b66b50064 Benchmarks: Don't export WORLD_SIZE when using MPI
I just realized we don't need it (any longer?).
2017-06-02 23:42:11 +02:00
cf42c1a044 Improve error messages of DataChannel::newChannel 2017-06-02 23:42:11 +02:00
f717f29d7e Change function names; Change thpp::Tensor to THDTensorDescriptor 2017-06-02 23:42:11 +02:00
181d2f41bd Add initial Python wrappers for THDTensors 2017-06-02 23:42:11 +02:00
2059ece284 Exit workers gracefully in master-worker mode 2017-06-02 23:42:11 +02:00
b3e100b40e Add copy (TH <-> THD) functions to MW mode 2017-06-02 23:42:11 +02:00
ec2de16776 Improve README copyediting 2017-06-02 21:02:14 +02:00
ea05d6aec3 Fix compilation with cuDNN 5 (#1703) 2017-06-02 14:03:02 -04:00
5a93d6b903 Fix CUDA_HOME detection (#1675) 2017-06-02 19:26:00 +02:00
75e0df271a Add Inverse to autograd (#1670)
* Add Inverse to autograd

* Add SkipTest to autograd tests
2017-06-02 12:00:13 -04:00
565bf7116b A pile of misc doc fixes. (#1682)
* A pile of misc doc fixes.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Handle @apaszke  review comments.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Initial csrc documentation.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-06-02 11:59:03 -04:00
f1c57ace1b added input dim checks to convxD and conv_transposedxd (#1695)
* add input dim check for conv2d

* add None check to conv2d

* added input dim checks to convxD and conv_transposedxd

* flake8 fixes
2017-06-02 11:58:19 -04:00
460b8715a8 display version number in docs 2017-06-02 11:56:48 -04:00
6da111c53d Merge commit '00843c57c936720b3d17f4c0afaab08dcb52a7cc' 2017-06-02 11:52:19 -04:00
568c5c91ee substitute cudnnFind* functions with cudnnFind*Ex 2017-06-02 11:52:12 -04:00
00843c57c9 substitute cudnnFind* functions with cudnnFind*Ex 2017-06-02 11:50:50 -04:00
501467db17 added param name to tuple_parser for better error messages 2017-06-02 16:16:21 +02:00
d51cd61e2e add checks for input, weight and bias types when using cudnn conv2d (#1689) 2017-06-01 10:06:30 -04:00
447fe953e5 Modify the sample code of volatile (#1694)
The original two inputs (torch.randn(5,5)) can not be used as input of resnet, which must be (batch, channels, width, height)
2017-06-01 09:46:04 -04:00
7b5af7d1b7 Expand ibverbs read timeout messages
Summary: TSIA

Reviewed By: romain-intel

Differential Revision: D5158642

fbshipit-source-id: 6e55a69a140c1f5f6e4ce6262afaf5014c412414
2017-05-31 19:50:21 -07:00
afc26ac675 Added time-out to ibverbs transport
Summary: Extended the time-out option from just working on TCP to also working with ibverbs

Reviewed By: pietern

Differential Revision: D5090258

fbshipit-source-id: fee685850d761d0c2130852f513c64ceb19f4e9e
2017-05-31 11:20:40 -07:00
6f791e74f1 Add a minimum iteration count of 1 for benchmarks
Summary:
For some long running benchmarks, the iteration count could be 0
which would lead to a segfault when printing results

Reviewed By: pietern

Differential Revision: D5149034

fbshipit-source-id: 7b56e8961c302d1ff11ffcd74ca8e909ea046231
2017-05-30 18:12:39 -07:00
3106423713 Synchronize with H2D copyAsync before signalling the broadcast sender
Summary: Closes https://github.com/facebookincubator/gloo/pull/41

Differential Revision: D5149996

Pulled By: pietern

fbshipit-source-id: 15d61fab9babfeb1e4178b84ecf5f6e32ad3bfb3
2017-05-30 14:20:29 -07:00
4eb448a051 Fix simple typo
Dimension a bit wrong
2017-05-28 18:53:04 +02:00
065c59860a Fix docs: masked_fill_ takes a value, not a tensor. (#1663) 2017-05-26 14:41:03 -04:00
45f665d05c Fix decodeUInt64BE
Fixes #1658
2017-05-26 11:21:31 -07:00
64faf120ac Adding support for ADD_TORCH_LIBRARY macro 2017-05-25 15:41:52 -07:00
0b74f0d796 lua 5.3 changes and gcc constants 2017-05-25 15:41:52 -07:00
8074180081 Faulty error message for InstanceNorm1d (#1609) 2017-05-25 17:13:01 -04:00
5ce4a4adbf Merge commit '3f1f3f97343d2ab7eb522cac7330f6b7478bd4da' 2017-05-25 16:51:57 -04:00
3e9caed731 Merge commit 'bd705d38ce11a0ca1547f709f29f80a02b3dd894' 2017-05-25 16:51:09 -04:00
7b578dd68e Add scatterAdd 2017-05-25 16:49:48 -04:00
3f1f3f9734 Add scatterAdd 2017-05-25 16:49:32 -04:00
bd705d38ce Add scatterAdd 2017-05-25 16:49:22 -04:00
630af4d7d8 add learning rate schedulers (#1370) 2017-05-25 16:21:43 -04:00
0409b42a02 Merge commit '3abe5c80d2073f0e72f79b88f11b2a9d320fb116' 2017-05-25 15:40:27 -04:00
c39d48ea7d Fast transposed copy 2017-05-25 15:39:21 -04:00
3abe5c80d2 Fast transposed copy 2017-05-25 15:39:07 -04:00
05bc877a05 make THPPointer have explicit constructors (#1636) 2017-05-25 15:35:54 -04:00
7ea9d9af4e Fix build when included by another project; take 2
Summary:
Only adding `include_directories` doesn't propagate to the including
targets. Also use `target_include_directories` to do so.
Closes https://github.com/facebookincubator/gloo/pull/39

Differential Revision: D5131001

Pulled By: pietern

fbshipit-source-id: 6c58c4b76ae7fa008e4fb26d1bca7900165884d0
2017-05-25 11:50:23 -07:00
6a7c56499c How to manage multiple build trees of PyTorch. (#1654)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-25 11:21:52 -04:00
46ee1e4687 Clarify definition of gather function in docs. (#1652) 2017-05-25 11:06:28 -04:00
e63b49d9ab Fix build when included by another project
Summary:
The CMake variable CMAKE_BINARY_DIR points to the top level build
directory. For standalone Gloo builds this path lets files include the
generated file "gloo/config.h". When Gloo is included as project, this
variable points to a different path and "gloo/config.h" cannot be
resolved. Fix is to build a path from CMAKE_CURRENT_BINARY_DIR.
Closes https://github.com/facebookincubator/gloo/pull/38

Differential Revision: D5129385

Pulled By: pietern

fbshipit-source-id: 722cebf4892b34f869fe43320153efbb181555b6
2017-05-25 07:50:53 -07:00
036c3f93af Check for released variables in SavedVariable::unpack() (#1648)
Fixes #1288
2017-05-25 00:35:19 -04:00
4f261f5730 Add support for fast float16 reductions using AVX
Summary: Using Misha's vectorized AVX code to greatly improve performance of reductions on float16 values. Float16 reductions are now 2x faster than float.

Reviewed By: pietern

Differential Revision: D5123331

fbshipit-source-id: 03d4e76886d538b7e24eedaf32a92231a80b1e43
2017-05-24 21:20:06 -07:00
98581b9f7e Fix conv1d segfault when weight doesn't require grad (#1646)
Fixes #1600
2017-05-24 20:46:32 -04:00
9a497f824b Add size/dimensionality documentation for torch.gather. (#1645) 2017-05-24 20:42:18 -04:00
1e63a04a18 Use clear-to-send notification for broadcast algorithms
Summary:
The broadcast algorithms use the buffers they were given directly.
There is no inbox/outbox pattern. This means that we can race if the
algorithm is run repeatedly within a short time frame. This hasn't
been an issue so far since we've only used it in combination with
other process wide barriers.

Since this adds a round trip the latency of these ops from the root
rank perspective increases. The variance between the before and after
runs is pretty high since there is no back and forth interaction on
the root. It simply waits for recipients to be ready and then sends
its data.

Before:

```
Device:      tcp, pci=0000:25:00.0, iface=eth0, speed=50000
Algorithm:   broadcast_one_to_all
Options:     processes=4, inputs=1

   elements   min (us)   p50 (us)   p99 (us)   max (us)    samples
        100          1         16         29         50     426075
        200          2         17         32         50     179953
        500          2         11         31         59     140291
       1000          2         12         29         59     177619
       2000          3         12         29         62     117882
       5000          5         16         31         64     127113
      10000          9         21         38         88      60328
      20000         19         36         65        130      30427
      50000         48         68        221        556      11180
     100000         92        136        426        871       7314
     200000        193        251        829       2965       4092
     500000        492        638       2098       4133       1677
    1000000       1195       2024       3513      11646        628
    2000000       3446       4216       5007      17100        282
    5000000      12956      13919      14941      37751         71

```

After:

```
Device:      tcp, pci=0000:25:00.0, iface=eth0, speed=50000
Algorithm:   broadcast_one_to_all
Options:     processes=4, inputs=1

   elements   min (us)   p50 (us)   p99 (us)   max (us)    samples
        100         15         37         52        107      27332
        200         14         40         63        199      28620
        500         17         37         52        118      18299
       1000          9         39         57        120      33375
       2000         20         57         78        180      24779
       5000         31         61         84        190      18039
      10000         39         70         90        225       8908
      20000         57        108        130        940       8313
      50000         94        163        217       1933       5326
     100000        132        231        331       3501       3681
     200000        256        426        560       6509       2272
     500000        774       1092       1698      10039        985
    1000000       1132       2106       3878      18218        484
    2000000       3509       4252       6832      20228        226
    5000000      11326      15447      27129      52694         77
```

Reviewed By: wesolwsk

Differential Revision: D5123341

fbshipit-source-id: f3bab4f75ef7c38817f74f00b382f18fe43d85d5
2017-05-24 15:36:36 -07:00
e54112758c Fix potential vector out of range issue in ContextFactory::makeContext
Summary: Vector out-of-range error was being triggered in some tests due to trying to get the address of an element past the end of vector.

Reviewed By: pietern

Differential Revision: D5123044

fbshipit-source-id: 004f72ebaa27c609290959c12a3d99b16289bfa8
2017-05-24 14:50:09 -07:00
e1d257bc6d Fix segfault in autograd: (#1644)
* Fix segfault in autograd:

1) Every "output" variable must have a grad_fn or grad_accumulator
2) compute_partial_exec_callbacks uses Python errors

* assertRaisesRegexp was renamed assertRaisesRegex in 3.2

* Use HANDLE_TH_ERRORS macro
2017-05-24 17:13:08 -04:00
3d38e4f126 Acquire GIL before THPVariable_wrap (#1625)
* Acquire GIL before THPVariable_wrap.

* mutex not required when GIL is held.

* Remove unused mutex.
2017-05-24 15:19:34 -04:00
fa93653d09 Improve handling of graph roots in autograd engine (#1635) 2017-05-24 14:50:07 -04:00
ff047fdeef Fix the mix-up of height and width on depth-wise convolution 2017-05-24 21:05:08 +08:00
2486a6bbd0 Add missing header file types.h in CMakeLists.txt
Summary: A recently added header file was missing in CMakeLists.txt

Reviewed By: pietern

Differential Revision: D5116962

fbshipit-source-id: 6c3fbd4b49c913f20308c1b057a7e09806e0c2b0
2017-05-23 16:50:41 -07:00
640846b864 Fix race in ibverbs transport
Summary:
In a previous commit where the slot numbering was expanded, I changed
the memory region send/recv path to use a map for the outgoing memory
regions (since they may complete out of order). Before, this was a
fixed size array, which was mutated by both the user thread and device
thread without holding a lock. The map, however, can't be mutated
without a lock. This change adds that lock and a few assertions to
check for this type of problem.

Reviewed By: andrewwdye

Differential Revision: D5108194

fbshipit-source-id: 1908c988112469ecdec6cb6eb9849068d896c409
2017-05-23 15:38:48 -07:00
ba56de1150 add coding UTF-8 declaration 2017-05-23 16:02:34 -04:00
6e3e453ad2 Tidy up convs docs (#1602) 2017-05-23 18:32:33 +02:00
f5d919a685 Generate config.h file with compilation options
Summary:
This file can then be used by downstream code to figure out what Gloo
features it can support (e.g. ibverbs transport or not).
Closes https://github.com/facebookincubator/gloo/pull/36

Differential Revision: D5110769

Pulled By: pietern

fbshipit-source-id: 2c0c07537258048737ae764a4978f2f7fdbd992d
2017-05-23 09:26:03 -07:00
02e4ca9cab fix wrapper 2017-05-23 08:43:13 -07:00
70a774898e Remove superfluous forward declaration
Summary: ContextFactory is no longer mentioned in gloo/context.h.

Reviewed By: romain-intel

Differential Revision: D5110328

fbshipit-source-id: 48dd020dc39d71d0d5f72deebfa5d80122b70c0d
2017-05-23 08:20:55 -07:00
49befe3fcd Remove commPairs_ member variable from halving/doubling
Summary: TSIA

Reviewed By: wesolwsk

Differential Revision: D5110348

fbshipit-source-id: d3346e2af1a9f13410dc93336c53040a29e22e66
2017-05-22 21:21:42 -07:00
7eac2073b8 Add notification mechanism to ContextFactory
Summary:
This is another example where our unsolicited writes may interfere
across calls to the collective function. In this case, it was possible
for a second call to overwrite a pair's address before it had been
used to connect the pair in the previous iteration.

Thinking out loud, we could avoid this from happening by supporting
this pattern natively in the Buffer classes. For example, we can add a
notification mechanism (opt in) to the Buffer class such that the
receiver may call `ackRecv()` to acknowledge receipt and handling of
the data in the buffer. Then the sender will block on new sends until
acknowledgement from the previous send has been received. Until then,
we have to keep an extra eye out.

Reviewed By: wesolwsk, romain-intel

Differential Revision: D5095430

fbshipit-source-id: 4c100433108fccea7457bba4dc00f651f722e6c9
2017-05-22 19:50:18 -07:00
45524ec33c Fix indices bug in MM.py (#1613) (#1617) 2017-05-22 16:47:51 -04:00
f072c74dfd make it effective to transfer a tensor from other devices to device 0 (#1610) 2017-05-22 11:06:57 -04:00
107a0fe9ac Revert "Revert "ClassNLLCriterion supports missing targets"" 2017-05-21 13:48:19 -04:00
2acfb2376a fixes eval mode in InstanceNorm (#1604)
fixes https://github.com/pytorch/pytorch/issues/1541
2017-05-21 13:27:48 -04:00
0c5598c668 Update build status matrix 2017-05-21 12:20:50 +02:00
feaee29bfe Add argmax and argmin to docs 2017-05-20 18:56:20 +02:00
7f6cd7c7ea Fix error message in CUDA forked subprocess (#1585)
We need to re-call _lazy_init in _CudaBase.__new__ in the subprocess.
2017-05-19 12:36:08 -04:00
625850c2c2 Check cuDNN version at runtime (#1586)
* Check cuDNN version at runtime

This checks that the version from cudnn.h matches the version from
libcudnn.so.

Fixes #1476

* Only check major and minor version numbers
2017-05-19 01:55:09 -04:00
9b3447761a Check for required non-None arguments in C++ autograd functions (#1589) 2017-05-19 01:47:35 -04:00
ed679fc43c disabling fd leakchecker test (#1593) 2017-05-19 01:20:50 -04:00
e6c9509a41 Fix call to Tensor.set_ in rnn.py (#1592) 2017-05-18 20:28:49 -04:00
c57f0530e7 let long_args False for param "size" of set_ (#1568)
* fix #1524, let long_args False for param "size" of set_
2017-05-18 19:31:36 -04:00
8021bb938c Remove slot number limitation from ibverbs transport
Summary:
The pair was still hardcoding limits on the slot numbers. In this
change those limits are lifted.

This also adds back assertions on work completion status in
handleCompletion.

Reviewed By: wesolwsk

Differential Revision: D5090457

fbshipit-source-id: 7bf884e1f31e48e8f1cdfb179a225999e28171b2
2017-05-18 16:20:40 -07:00
1f4317be3f Add support for half-precision floating point operations
Summary: Add support for collectives over vectors of half-precision floating point values.

Reviewed By: pietern

Differential Revision: D5062938

fbshipit-source-id: 0b39fa53370393fec1edf2d852ff7f1d862b9022
2017-05-18 15:09:06 -07:00
cba46a4869 Assert that we don't do out of bound writes on recv
Summary:
The halving/doubling algorithm had two instances where a receive
buffer was registered with a number of elements instead of a number of
bytes. This change adds the assertion that should have caught this in
the first place.

Reviewed By: wesolwsk

Differential Revision: D5089483

fbshipit-source-id: fd0f0724ef04300236c9297ee88b27e61fb1e5a0
2017-05-18 14:34:39 -07:00
b391f53681 Cache send/recv buffers in ContextFactory
Summary:
The original implementation created temporary buffers on the backing
context. This also meant an ordering problem when using the ibverbs
transport, as a call to send will block until the remote side has
created its receive side buffer. Since all buffers are now created
prior to using them, this is no longer an issue.

Reviewed By: romain-intel

Differential Revision: D5082352

fbshipit-source-id: 4c260f06e8f461c0336e7eec7ca891e07ff41cd3
2017-05-18 10:20:42 -07:00
85732b52ec fix cuda multiple algorithm test
Summary: Fixing a bug in the multiple algorithm test where threads were spawned repeatedly, causing collisions during rendezvous.

Reviewed By: pietern

Differential Revision: D5082945

fbshipit-source-id: 4adbbc963b1ff652f73a44cd9fd75dcd3325f182
2017-05-17 16:35:25 -07:00
156fe28666 dataloader can now handle growing datasets (#1575) 2017-05-17 19:23:15 -04:00
2f4bf4ab39 Rewrite 'How autograd encodes the history' to accurately describe current setup. (#1580)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-17 19:21:20 -04:00
1f3ff5ced2 Miscellaneous documentation around autograd. (#1577)
* Miscellaneous documentation around autograd.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-17 19:19:24 -04:00
b8b7f879c2 .gitignore updated with editor temporaries (#1574) 2017-05-17 19:16:02 -04:00
7b10b16496 Move ibverbs buffer send logic to pair.cc
Summary:
TSIA

This matches the approach in the TCP transport where all send/recv
logic is contained in the pair code.

Reviewed By: wesolwsk

Differential Revision: D5082503

fbshipit-source-id: b70886ed9aaeb381cdb45fba00704118cff62a23
2017-05-17 15:54:34 -07:00
da86633c7c Additional synchronization in halving/doubling
Summary:
This is necessary to avoid the next iteration of the algorithm
overwriting data in recvBuf_ before it has been consumed by the
receiver of that data. If this does happen, the result of the previous
iteration for the receiving end is corrupted. This can only happen in
async mode on the TCP transport (so all incoming data is unsolicited)
when spinning on the run function.

Reviewed By: wesolwsk

Differential Revision: D5074789

fbshipit-source-id: 66668fbd885888f26266d812e78d61c6d65c2461
2017-05-17 15:21:09 -07:00
c573d53939 Bug fixes (#1573)
* Fix clang warnings
* Raise errors when unsupported ConvNd configurations are used
* Properly handle Variable indexing with LongTensors
* Support both tensors and variables in Variable.type_as
2017-05-17 15:28:16 -04:00
cb79c24d0b Added powerpc64le support (#1572) 2017-05-16 08:30:06 -06:00
caa1cdf0ce ClassNLLCriterion ignoreIndex 2017-05-15 22:27:00 -04:00
368ecb47f9 Fix flaky test_sparse_adagrad (#1562) 2017-05-16 01:03:08 +02:00
6107d15d14 Twice differentiability of pointwise functions (#1531) 2017-05-15 12:00:59 -06:00
ba885a1a51 expose bitwise operators from C/CUDA (#1556)
* fix issue #1549, expose bitwise and

* expose C bitwise or of Tensor

* expose C bitwise xor of Tensor

* use built-in method for inplace and, or, xor

* expose C bitwise lshift(ilshift) and rshift(irshift) of Tensor
2017-05-15 11:36:15 -06:00
ce1a0eb6c9 Merge commit '7afd78d77ffad503357c35f495ae6d4d2b008862' 2017-05-15 11:20:27 -06:00
7afd78d77f Cuda reduce in a consistent direction 2017-05-15 11:18:20 -06:00
6b84dc26f0 Add F.cosine_similarity (#1502) 2017-05-15 11:12:54 -06:00
0f458ee3c4 Fix memory leak in THCSTensor_spcadd. (#1519)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-15 11:11:03 -06:00
8aa011f52a minor typo and style changes to _torch_docs.py (#1559) 2017-05-15 15:32:56 +02:00
2a610c9d13 Revert "Update to ignore zero targets" 2017-05-14 18:15:30 -07:00
ac8b2c0fa3 Revert "ClassNLLCriterion supports missing targets" 2017-05-14 18:14:36 -07:00
0ba20435ce Add high order grad support for some operators (#1507) 2017-05-14 23:02:04 +02:00
6fc9130052 Adapt documentation to reflect new supported argument (#1548)
Reflect the changes of #1323
2017-05-13 21:09:34 -06:00
28f4f6db2c typo error for torch.addr (#1547)
fix the typo error in the example for torch.addr
2017-05-13 08:53:05 -07:00
9b2de027be SpatialDepthWiseConvolution.cu added 2017-05-12 16:02:14 -04:00
bf4345e2ef ClassNLLCriterion supports missing targets 2017-05-12 15:15:39 -04:00
029290c5b1 SpatialDepthWiseConvolution 2017-05-12 11:34:27 -04:00
78abf0134d Merge pull request #458 from jnhwkim/master
Update to ignore zero targets
2017-05-12 10:38:18 -04:00
9db7787316 Updating __getitem__ and __len__ for containers (#1544) 2017-05-12 16:17:06 +02:00
efa913b1c2 fix uninitialized variable in cmake FindSSE (#1023) 2017-05-11 18:57:34 -07:00
d1a4467682 fix a bug when calling modules
a module that returns a non-standard data structure currently breaks
due to checks for backwards hooks. This refactors the code slightly so
this will only break in the event of backwards hooks.
2017-05-11 23:00:45 +02:00
507ddc4cde Temporary fix for multiple backwards with fused pointwise RNN (#1540) 2017-05-11 11:18:56 -07:00
aba05ce9db Ensuring float tensors call float versions of math functions 2017-05-11 10:39:35 -07:00
be843eb26b Add unfold to autograd (#1523) 2017-05-11 17:53:16 +02:00
5bb13485b8 Fix Linear function 2017-05-10 16:43:14 +02:00
a86adf43a1 Fix comparison functions 2017-05-10 16:43:14 +02:00
1c304a9ef6 Expose variable attribute of AccumulateGrad 2017-05-10 16:43:14 +02:00
feef54ec34 Don't modify non-volatile grads in zero_grad 2017-05-10 16:43:14 +02:00
5026209d0c Minor fix in Prod backward 2017-05-10 16:43:14 +02:00
e7220380bc Add new flags to Variable.backward 2017-05-10 16:43:14 +02:00
9fa0e403d6 Replace retain_variables with retain_graph 2017-05-10 16:43:14 +02:00
35cf380ed1 Improve output wrapping logic in autograd 2017-05-10 16:43:14 +02:00
3a7e068439 Remove spurious memo argument in Module.parameters() (#1527) 2017-05-10 13:55:15 +02:00
862105ec8b Merge commit 'd5e821044aa20d67122f4570a3f1cb7e6e9c2617' 2017-05-09 17:06:25 -07:00
d5e821044a Make torch.cat not synchronize the host and device 2017-05-09 17:05:23 -07:00
bfc8a3ebba Reference counting documentation. (#1520)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-09 17:02:28 -07:00
6fab62173e Restore examples with keepdim=True default. 2017-05-09 14:49:55 -07:00
c4742fd128 Explicitly pass keepdim=False for tests that require it.
If we change the default to False, reverting this commit is optional.
2017-05-09 14:49:44 -07:00
e124790cb2 Change keepdim default to False. 2017-05-09 14:49:21 -07:00
171638a451 Fix test_normalize NN test. 2017-05-09 14:25:06 -07:00
d95f711501 Add a keepdim test to torch_test. 2017-05-09 14:25:01 -07:00
b9e00dfbb8 Make (non-legacy) nn backwards compatible.
The keepdim change only seems to leak in one place:
when the grad_bias is returned in linear.py.
2017-05-09 14:24:53 -07:00
f6a00fac13 Add autograd tests for keepdim 2017-05-09 14:24:45 -07:00
be5191a00b Add documentation for keepdim. 2017-05-09 14:16:42 -07:00
c9d8e0a43a Change all legacy/nn modules to use keepdim=True (even if tests don't fail).
We shouldn't be introducing changes in legacy modules if we can avoid it.
2017-05-09 14:16:31 -07:00
ae2b2cbbec Make keepdim work with autograd. 2017-05-09 14:15:59 -07:00
f4cf1d6d18 Merge commit 'af790f86f329364dacef1301fc9b5b292629075c' 2017-05-09 14:04:08 -07:00
c34cff7035 Merge commit '906c550e1079e9762194db59440a202ffca90dca' 2017-05-09 14:03:28 -07:00
194d7408bb Merge commit '5f308b50fb558a620253443ef45f7cf3a91be410' 2017-05-09 14:02:25 -07:00
0d538246fb Merge commit '98dbdc464b0f53ecc89af58cc994c7e8d7617e4e' 2017-05-09 14:01:13 -07:00
7c3cb24485 Add a keepdim parameter for reduction functions over a single dimension.
By default, this parameter is False -- a backwards incompatible change, but
one that follows numpy semantics, e.g. numpy.sum (numpy names the parameter
"keepdims" since you can pass multiple dims to reduction functions).

The old behavior seems desired for normalization type operations
where the tensor will immediately be expanded out again, e.g.:
probs.sum(1).expand_as(probs)
which no longer works because the dimension to expand is missing.
This can be fixed by simply passing True as "keepdim" argument
to the reduction operation, e.g:
probs.sum(1, keepdim=True).expand_as(probs)
2017-05-09 14:01:03 -07:00
af790f86f3 Add a keepdim parameter for reduction functions over a single dimension.
By default, this parameter is False -- a backwards incompatible change, but
one that follows numpy semantics, e.g. numpy.sum (numpy names the parameter
"keepdims" since you can pass multiple dims to reduction functions).

The old behavior seems desired for normalization type operations
where the tensor will immediately be expanded out again, e.g.:
probs.sum(1).expand_as(probs)
which no longer works because the dimension to expand is missing.
This can be fixed by simply passing True as "keepdim" argument
to the reduction operation, e.g:
probs.sum(1, keepdim=True).expand_as(probs)
2017-05-09 11:55:42 -07:00
906c550e10 Add a keepdim parameter for reduction functions over a single dimension.
By default, this parameter is False -- a backwards incompatible change, but
one that follows numpy semantics, e.g. numpy.sum (numpy names the parameter
"keepdims" since you can pass multiple dims to reduction functions).

The old behavior seems desired for normalization type operations
where the tensor will immediately be expanded out again, e.g.:
probs.sum(1).expand_as(probs)
which no longer works because the dimension to expand is missing.
This can be fixed by simply passing True as "keepdim" argument
to the reduction operation, e.g:
probs.sum(1, keepdim=True).expand_as(probs)
2017-05-09 11:55:29 -07:00
5f308b50fb Add a keepdim parameter for reduction functions over a single dimension.
By default, this parameter is False -- a backwards incompatible change, but
one that follows numpy semantics, e.g. numpy.sum (numpy names the parameter
"keepdims" since you can pass multiple dims to reduction functions).

The old behavior seems desired for normalization type operations
where the tensor will immediately be expanded out again, e.g.:
probs.sum(1).expand_as(probs)
which no longer works because the dimension to expand is missing.
This can be fixed by simply passing True as "keepdim" argument
to the reduction operation, e.g:
probs.sum(1, keepdim=True).expand_as(probs)
2017-05-09 11:55:20 -07:00
98dbdc464b Add a keepdim parameter for reduction functions over a single dimension.
By default, this parameter is False -- a backwards incompatible change, but
one that follows numpy semantics, e.g. numpy.sum (numpy names the parameter
"keepdims" since you can pass multiple dims to reduction functions).

The old behavior seems desired for normalization type operations
where the tensor will immediately be expanded out again, e.g.:
probs.sum(1).expand_as(probs)
which no longer works because the dimension to expand is missing.
This can be fixed by simply passing True as "keepdim" argument
to the reduction operation, e.g:
probs.sum(1, keepdim=True).expand_as(probs)
2017-05-09 11:54:58 -07:00
e70164316c Merge commit '91a118c116d15d280a99a39666d298be15c6d592' 2017-05-08 16:58:56 -07:00
33b3968660 add larger tests for qr 2017-05-08 16:58:54 -07:00
91a118c116 Fix bug in magma qr decomposition and add tests for larger matrices 2017-05-08 16:44:15 -07:00
0764589ed1 Merge commit '008a8c9720183d7bf8b00bf64d8d21c62270089f' 2017-05-08 16:24:14 -07:00
27671c800d Merge commit '105df5844dca21f964d180a918c808489862941f' 2017-05-08 16:23:12 -07:00
d0504aa41d Implement lgamma function. 2017-05-08 16:21:26 -07:00
008a8c9720 Implement lgamma function. 2017-05-08 16:20:52 -07:00
105df5844d Implement lgamma function. 2017-05-08 16:20:39 -07:00
50bf7d5cbc Merge commit '066fbcd014fa4092152b2cd04ad1d92fc8d7bd59' 2017-05-08 16:13:57 -07:00
066fbcd014 use current stream in cat array kernel launch 2017-05-08 16:12:10 -07:00
ecf29f10ad Merge commit '22bbd7ac33ba51469cc913cb01fcd3b70a42e528' 2017-05-08 16:10:00 -07:00
22bbd7ac33 s/IndexType/long 2017-05-08 16:09:02 -07:00
2075abbe30 Gloo: Added a way to create connected contexts from another context
Summary:
Added a context factory that allows you to use an existing context to
create other fully connected contexts much more cheaply (without having
to rely on a store).

Limitations:
  - The backing context needs to be fully connected

Reviewed By: andrewwdye, pietern

Differential Revision: D4985121

fbshipit-source-id: 31ceabccbb679cedb18ec9927b6c166bef5989bb
2017-05-08 16:02:04 -07:00
e694db0eeb Raise error when Variable is converted to bool. Fixes #1482. (#1491) 2017-05-08 23:14:11 +02:00
c5ae79fe4e Make clamp twice differentiable (#1514) 2017-05-08 23:12:42 +02:00
4ad2e155bc Make nn.Sequential more pythonic (#1510)
A minor fix which uses `enumerate` during iteration.
2017-05-08 07:32:07 -07:00
6d693fe413 Add F.normalize (#1467) 2017-05-07 13:54:16 +02:00
23b556ef77 Expose custom attributes from C++ functions (#1430) 2017-05-07 13:49:55 +02:00
e3f41a4962 Add high order gradient support for Sigmoid (#1496) 2017-05-07 13:00:20 +02:00
90e9f8a476 Avoid segfault when calling join_with with self as arg (#1493) 2017-05-07 00:35:11 +02:00
5f15a9e0cb Add a note about THPFunction_asFunction 2017-05-06 14:28:32 -07:00
ff0ff33a11 Fix docs for InstanceNorm (#1477) 2017-05-04 18:11:15 -04:00
eb2c6ea874 set deviceId_ to -1 when CudaDevicePointer and CudaStream do not have valid data
Summary: Set deviceId_ to -1 when CudaDevicePointer and CudaStream do not have valid data

Reviewed By: andrewwdye

Differential Revision: D4881374

fbshipit-source-id: e973a70e2e6e4519f5fdc2ad4e76f232d9593751
2017-05-04 15:05:27 -07:00
e64b2e1cd7 add documentation for cwrap plugins (#1474) 2017-05-04 17:50:58 -04:00
7d40140bfb Document squeeze behavior on 1-dimensional tensors of size 1. (#1470) 2017-05-04 16:54:22 +02:00
e50c7daaf9 Use Qr factorization to get orthogonal matrix in orthogonal init (#1453) 2017-05-04 07:11:59 -04:00
600f366a13 Merge commit 'a6876a4783ce3d1bb3c6ba69f54c31983097ed17' 2017-05-04 06:51:10 -04:00
a6876a4783 fix corner-case in MaxPooling 2017-05-04 06:50:15 -04:00
4e18d89791 added twice differentiation for a bunch of ops (#1426) 2017-05-04 06:47:14 -04:00
de9845588d Merge commit 'c061ed5bda238e1276601593343c10428d01eaae' 2017-05-03 23:14:26 -04:00
c061ed5bda handle beta=0 for gemv with transpose 2017-05-03 23:05:41 -04:00
e9d648c5e7 Fix memory leak introduced by 72e8190 (#1464) 2017-05-03 18:38:56 -04:00
80c0a8776b Fix #1447: sparse_mask doesn't make sense with uncoalesced tensors (#1458)
* Make sparseMask error if mask is uncoalesced.

Fixes #1447.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Add test for sparse adagrad.

Previously, the sparse codepath was not exercised at all; this commit
adds a very simple test case "sparse Rosenbrock"; the idea is to do
Rosenbrock but then knock out one of the dimensions so that the
tensor is sparse.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-03 17:53:45 -04:00
4ec0435b39 Report overall size of sparse tensors. (#1461)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-03 17:51:56 -04:00
f8be3a20d3 Fix scatter_ documentation typo. (#1463) 2017-05-03 17:31:04 -04:00
7b21b0b6d7 Retry on write EINTR in sync mode
Summary:
We weren't handling an edge case where write(2) would return EINTR
when in sync mode. The Pair::write function would return false
indicating it didn't complete the write whereas the send function
expects it to complete when in sync mode. With this change we now
advance the cursor and retry the write when fewer than expected bytes
were written.

Also see https://github.com/facebookincubator/gloo/issues/34

Reviewed By: andrewwdye

Differential Revision: D4996949

fbshipit-source-id: 3bad4fa3d0a01517f20b64904aa71410641fa60f
2017-05-03 14:26:26 -07:00
0910e0ac90 Fix memory leak in coalesce. (#1460)
Fixes #1449.

For future reference, we should have a doc explaining our ref-counting
conventions; it looks like this bug slipped by because we assumed that
newTensor was taking ownership of the pointers it was passed in.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-03 13:29:39 -04:00
93094294ba function backward attempted to multiply tuple by variables (#1459)
One line fix--changed it to multiply the grad_variables by the
len(variables) when grad_variables is None.
2017-05-03 13:12:21 -04:00
743e4894d2 Prefix values/indices/sparse_mask/nnz with underscore (#1457)
As discussed in #1441.

I also added some docs giving clear guidance about coalescing
in sparse tensors.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-03 11:14:10 -04:00
f273377d19 add device asserts in scatter/gather kernels 2017-05-03 11:12:26 -04:00
836332e0a1 Merge commit 'f1591fade5c8df5272b79ab1bd8b0b261bb5606a' 2017-05-03 11:11:43 -04:00
f1591fade5 add device asserts in scatter/gather kernels 2017-05-03 11:10:31 -04:00
2e7635b929 Add flexible bilinear upsampling aspect ratio redux (#1317) 2017-05-03 08:46:28 -04:00
e9953c4595 A number of post-merge fixes for test_sparse (#1444)
* Simplify _gen_sparse

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Randomly generate an uncoalesced tensor and test with it.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Simpler implementation of cpu_only suggested by @apaszke

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Better implementation of randn, suggested by @soumith

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Lint fix.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

* Fix CUDA type error.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-03 08:43:03 -04:00
72e8190994 Use at most one shared_ptr block at a time to manage THPFunctions (#1454)
* Fix failing ln in build_all.sh

* Use at most one shared_ptr block at a time to manage THPFunctions
2017-05-03 08:15:36 -04:00
e1278d4ee2 Fix typo in autograd docs 2017-05-03 03:11:55 -07:00
66bd200de0 bug fix - add previous slot offset to calculated slot value in halving-doubling algorithms
Summary: Previous slot offset was not added to the calculated value for the slot to be used in halving-doubling algorithms. If multiple instances were running, slot values could collide.

Reviewed By: pietern

Differential Revision: D4986618

fbshipit-source-id: 56b9220c91f31cc016d37e82907221460de70657
2017-05-02 16:19:55 -07:00
574cfe3cf3 Improve kthvalue documentation. (#1448)
1) Fix "kth" attr specification -- I can't get sphinx to generate `k`th,
but `k` th works with a space, unlike now where the highlighting continues
until the next attr.
2) Specify the size of the return tensors.
3) Add an example of the return tensor sizes with more than 1 dimension.
2017-05-02 17:22:02 -04:00
699755e04f Convert contiguous() call in adagrad to out-of-place coalesce. (#1446)
We missed this one in f2903332c7dce1fbb7d7d9f18dcfba8e853581df!

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-02 16:51:54 -04:00
fb07914c0c Recommendations for workflow when modifying C files. (#1443)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-02 15:46:45 -04:00
aa2ee86375 pytorch/thpp ~= facebook/thpp (#1445) 2017-05-02 15:46:10 -04:00
ecd51f8510 docs fixes 2017-05-02 15:42:33 -04:00
5aa1f769d3 Fix torch.dist documentation: function returns a float. (#1440) 2017-05-02 14:38:48 -04:00
eecc807a75 Keep track of number of in-flight send operations
Summary:
This helps guard against programming errors where waitSend is called
before send is called. It uses a std::atomic to keep overhead low.

Reviewed By: andrewwdye

Differential Revision: D4984604

fbshipit-source-id: 04a63b1ba088e3bcba0abff40771af666deb15e5
2017-05-02 09:35:46 -07:00
5386012164 Check return value of ibv_reg_mr for error
Summary:
This returns EFAULT when passing a GPU memory pointer (for GPUDirect)
and the ibverbs driver can't map the GPU's memory. Since the error is
pretty cryptic, crash with a more useful message.

```
terminate called after throwing an instance of 'gloo::EnforceNotMet'
  what(): [enforce fail at gloo/transport/ibverbs/buffer.cc:46] mr_ !=
  nullptr. ibv_reg_mr: Bad address (kernel module 'nv_peer_mem' not
  loaded; did you specify a GPU pointer?)
```

Reviewed By: andrewwdye

Differential Revision: D4982966

fbshipit-source-id: 72c220fe22a3bc59396cfff992ad5f0f9c5bf83a
2017-05-02 09:11:15 -07:00
4bf813e068 Document cdata non-NULL invariant, and consequence Python side. (#1435)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-02 11:17:20 -04:00
3b4bc721ef fix osx build and suppress clang warnings (#1432) 2017-05-02 09:33:24 -04:00
dca208b525 Refactor test_sparse to reduce boilerplate. (#1421)
* Refactor test_sparse to reduce boilerplate.

Instead of manually creating a helper function, threading an is_cuda
parameter around, and creating a test method for CUDA and non-CUDA
variants, we take a different approach:

- There are now some new member variables initialized in setUp which
  control the aspects of how we carry out the test; at the moment,
  it's just whether or not we are using CUDA.  This means
  you don't have to pass is_cuda around, or do a conditional to
  get the triplet of constructors you need.

  I'll note that I am not a big fan of member variables in test
  objects, but these are (intended to be) immutable so I think
  it should be OK.

- Instead of manually defining test_foo and test_foo_cuda, we now
  have a new TestCudaSparse class which overrides setUp (from above)
  to swap in the CUDA implementation.  Way less boilerplate, and NO
  metaprogramming needed.

  If you need to opt out of CUDA testing, there is a new cpu_only
  decorator you can use.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-01 21:52:58 -04:00
181cb15c72 Fix formatting error in docs.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-01 21:47:22 -04:00
7df8fbb64f Generalize halving-doubling to support non-power-of-two cases using binary blocks algorithm
Summary: A generalized version of halving-doubling that supports non-power-of-two number of processes by breaking up execution into blocks that are powers of two and communicating interblock after the intrablock reduce-scatter. Non-power-of-two cases will have some degree of load imbalance compared to power-of-two, but cases with few large blocks (e.g. 8 + 4 or 16 + 8) should still perform relatively well.

Reviewed By: pietern

Differential Revision: D4955947

fbshipit-source-id: af4f218fedb6adf475530c38386978b81f4f2b74
2017-05-01 16:05:22 -07:00
5c7453447f Fix bugs, rename differentiate to grad, make it more flexible 2017-05-01 16:44:56 -04:00
87164f554d Bug fixes 2017-05-01 16:44:56 -04:00
267e7c0431 Fix memory issues with Conv and BatchNorm 2017-05-01 16:44:56 -04:00
e5db8f98be Add torch.autograd.differentiate 2017-05-01 16:44:56 -04:00
20aa5b066f Convert some of the functions to new format
Also, fix a lot of issues that appeared after the previous commits.
2017-05-01 16:44:56 -04:00
de9998e198 Add support for the new Function format 2017-05-01 16:44:56 -04:00
702a2e3bc5 Make Variables not subclass Function anymore
Because of this Variables can no longer appear in the graph.
Every usage of a leaf Variable will leave an AccumulateGrad
function that has no outputs, but modifies var.grad as a side
effect.
2017-05-01 16:44:56 -04:00
2ca787fcf4 Refactor attribute names in autograd 2017-05-01 16:44:56 -04:00
2ec629bef9 Set SO_REUSEADDR to try and prevent bind errors
Summary:
After running the test suite many times we end up with a zillion
connections in TIME_WAIT state. Setting SO_REUSEADDR seems like it
should help binding to ports regardless of the TIME_WAIT state.

Reviewed By: andrewwdye

Differential Revision: D4979606

fbshipit-source-id: b611f9c9e11aba858dc192f6bca3d64e10100b52
2017-05-01 13:36:14 -07:00
2197e4c766 version bump 2017-05-01 15:54:52 -04:00
2a28283680 Fix pair destructor if in CONNECTING state
Summary:
It can happen that a pair is destructed while in CONNECTING
state when some unrelated code throws an exception after the connect
function has been called. The most likely place for this to happen is
when connecting pair A is in progress while connecting pair B throws
an exception. The exception will force destruction of all references
to pair A, even if it is in the CONNECTING state.

Also see https://github.com/facebookincubator/gloo/issues/33

Reviewed By: andrewwdye

Differential Revision: D4979557

fbshipit-source-id: 0cddddd3f478106f1694603fe7f2efe15a2d9aa1
2017-05-01 12:41:07 -07:00
4624278b1d Make sparse documentation title consistent with others. (#1420)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-05-01 11:48:00 -04:00
79d4ac670c Add map_location to load_url (#1418) 2017-05-01 10:21:30 -04:00
4ebf3ff46d Add base for CUDA allReduce and broadcast in DataChannelGloo 2017-05-01 01:49:10 -07:00
ac3ba9a2ad Rebase fixes 2017-05-01 01:49:10 -07:00
14e1bfddbc Change warning message in MPI 2017-05-01 01:49:10 -07:00
c19fbd3364 Update comments; Add inline accessors for value_type tuple in GlooCache 2017-05-01 01:49:10 -07:00
a17d96d571 Add multiple thread support for DataChannels
Previously, when using the same data channel in a multithreaded environment,
there was no guarantee that deadlocks
or even errors would not occur.
2017-05-01 01:49:10 -07:00
b7dcc29430 Forward declare GlooCache key_type 2017-05-01 01:49:10 -07:00
18b4dcd28b Remove unused variable in macro 2017-05-01 01:49:10 -07:00
be81304d27 Moved GlooCache to new file; Functions renames; Minor fixes 2017-05-01 01:49:10 -07:00
f07f13c6e9 Change Store exception handling 2017-05-01 01:49:10 -07:00
310d08c37b Fix store and all operations 2017-05-01 01:49:10 -07:00
234df2138a Fix compilation errors 2017-05-01 01:49:10 -07:00
2b340e7d50 Add python tests; Remove broken prefix store creation 2017-05-01 01:49:09 -07:00
6888c61fa8 Fix DataChannelGloo compilation 2017-05-01 01:49:09 -07:00
ba3328b365 Add DataChannelGloo tests 2017-05-01 01:49:09 -07:00
3b4fe5dfc4 Add isend/irecv; Add all types generator for template functions; Minor refactor 2017-05-01 01:49:09 -07:00
ce42761628 Add groups 2017-05-01 01:49:09 -07:00
df4791d6c0 Implement DataChannelGloo 2017-05-01 01:49:09 -07:00
7e8830c3d5 Initial gloo bindings 2017-05-01 01:49:09 -07:00
b91cec7f66 Fix THD library build for CUDA 2017-05-01 01:49:09 -07:00
765aeb1a08 Fix nonzero bug 2017-05-01 01:49:09 -07:00
280e2a94e5 Worker init clarification; Inform on error thread notification failure 2017-05-01 01:49:09 -07:00
e7f453b5de Add barrier to test; Minor changes; 2017-05-01 01:49:09 -07:00
8030aa0f1b Refactor error thread 2017-05-01 01:49:09 -07:00
40ad2cde62 Remove unnecessary nonzeroElems function 2017-05-01 01:49:09 -07:00
af4a978c44 Move error thread to CommandChannel; Minor fixes; 2017-05-01 01:49:09 -07:00
fe5fc6723f Remove unnecessary code 2017-05-01 01:49:09 -07:00
6e6179633b Minor fixes in THDMasterWorkerInit 2017-05-01 01:49:09 -07:00
c97e60c45d Add actual error reporting in Master 2017-05-01 01:49:09 -07:00
2cdb368f97 Add error handling in MasterWorker mode 2017-05-01 01:49:09 -07:00
a5b2f3461a Review fixes 2017-05-01 01:49:09 -07:00
d3e60599d2 Add benchmark scripts (#66) 2017-05-01 01:49:09 -07:00
98d8e0b040 Lapack functions implementation #2 + fixes after review 2017-05-01 01:49:09 -07:00
fe2c360eda Lapack function implementation #1 2017-05-01 01:49:08 -07:00
59ae109bbb Implement functions from set 1 (except Lapack) 2017-05-01 01:49:08 -07:00
8623076654 Add convertToRank to do bound checking 2017-05-01 01:49:08 -07:00
a362b4f367 Add support for unsigned char aka byte to MPI 2017-05-01 01:49:08 -07:00
ef724e355c Change rank type: int -> std::uint32_t; Minor fixes 2017-05-01 01:49:08 -07:00
e863d27393 Tweaks, fixes, cleanup in DataChannelTCP 2017-05-01 01:49:08 -07:00
4c388f9398 Revert structure changes; Minor fixes 2017-05-01 01:49:08 -07:00
6740d1d904 Rewrite CommandChannel 2017-05-01 01:49:08 -07:00
f891d9b1bf Don't build tests by default 2017-05-01 01:49:08 -07:00
a81f330854 Rename construct -> new; Minor fixes 2017-05-01 01:49:08 -07:00
c02241edbd Minor code refactor 2017-05-01 01:49:08 -07:00
f30a92fa17 Fix invalid socket initialization 2017-05-01 01:49:08 -07:00
1391ff99f4 Use TCP_NODELAY for data sockets 2017-05-01 01:49:08 -07:00
43019bd88a Always loop over all possible addresses in worker 2017-05-01 01:49:08 -07:00
d6380910f5 Removed unnecessary code; Minor fixes 2017-05-01 01:49:08 -07:00
04491e84e4 Fix build with CUDA 2017-05-01 01:49:08 -07:00
e247249a5f Implement TH_API functions from the set 4 2017-05-01 01:49:08 -07:00
0160438eb9 added logical not operator for ByteTensor (#1403) 2017-04-30 08:47:24 -04:00
7dd8571bc6 fix avg_pool docs in nn.functional 2017-04-30 08:44:43 -04:00
48a7869b23 Doc fixes (#1409) 2017-04-30 08:28:19 -04:00
582fd3db7d fix osx build 2017-04-29 09:29:57 -04:00
9169f60a84 Parallelize TensorMethods.cpp builds (#1400) 2017-04-29 09:07:21 -04:00
457d78a7d9 Use THCUNN backward kernels for Tanh and Sigmoid in Autograd (#1399) 2017-04-29 09:07:03 -04:00
a071ccbea6 fix NCCL makefile for CUDA 7.5 (#1401) 2017-04-29 09:04:01 -04:00
db1eb66456 corrected docstring for Dropout (#1404) 2017-04-29 13:40:47 +02:00
45020a74cd remove inplace pow and fix contiguous -> coalesce (#1398) 2017-04-28 18:26:29 -04:00
9c01f5d6b2 Document hybrid sparse tensors.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-28 23:53:01 +02:00
cbb9f08b71 Add new init methods gain, eye and dirac (#1172) 2017-04-28 17:16:40 -04:00
f75ab857b8 Add safeCoalesce() to tests 2017-04-28 17:11:05 -04:00
f2903332c7 Make coalesce() out of place 2017-04-28 17:11:05 -04:00
9643be76f9 speed up accumulation 2017-04-28 17:11:05 -04:00
4f09461d24 Rename sparse tensor contiguous() to coalesce() 2017-04-28 17:11:05 -04:00
bafb2e5cc2 Implement sparse pow. (#1387) 2017-04-28 23:06:09 +02:00
28a7fbbdf5 Documentation fix for torch.gather 2017-04-28 22:45:14 +02:00
4c1cdb6148 Refactor Python string utility function 2017-04-28 21:25:26 +02:00
775481ed56 re-enable dilated convolutions on Kepler (#1394) 2017-04-28 14:42:19 -04:00
5b2aac7c73 Merge commit '224f5eabf5cfb3a19abc1819f7dac230500b6bdb' 2017-04-28 13:48:06 -04:00
224f5eabf5 half<->float conversion cleanup (#680) 2017-04-28 19:46:42 +02:00
fd490c6490 Merge commit 'd6a31c68a0f39656257322a55c9e04dd579de828' 2017-04-28 13:42:23 -04:00
d6a31c68a0 Add option to disable ppc64le's VSX support
Set environment variable TH_NO_VSX=1 to disable VSX.
2017-04-28 13:41:03 -04:00
96a281dfab Add one more missing self.dilation parameter. (#1392)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-28 19:16:32 +02:00
94b147fd41 Allows dicts batches in dataloader. (#1354)
* Allow dicts in Dataloader

* use collections.Sequence instead of collections.Iterable in dataloader
2017-04-28 19:14:52 +02:00
c26f6877a0 guard topk for half (#759) 2017-04-28 11:57:15 -04:00
8908000262 function -> lambda in test 2017-04-28 10:31:40 -04:00
8b1d5727d8 fix minor docs 2017-04-28 10:13:52 -04:00
75f1989bec Add nn.Bilinear and tests 2017-04-28 10:11:30 -04:00
e221536ad8 Merge commit 'a44317fea88adddded91e068088415de1e66fd4b' 2017-04-28 08:04:39 -04:00
a44317fea8 Change magma_sgesvd to magma_sgesdd which is significantly faster 2017-04-28 08:03:39 -04:00
24e5a9057e Revert "Parallelize TensorMethods.cpp builds (#1364)" (#1390)
This reverts commit 060048bcd808893ba3113d09273a42642904078a.
2017-04-28 07:59:40 -04:00
060048bcd8 Parallelize TensorMethods.cpp builds (#1364) 2017-04-28 07:45:21 -04:00
77035d151e make topk test unique 2017-04-28 07:30:25 -04:00
50c9c23525 enable topk for all cuda 2017-04-28 07:14:21 -04:00
3f81803b09 Merge commit '69574a6dc4036b0113c512a1b2d74e23682c8a3b' 2017-04-28 07:08:43 -04:00
d421c473a9 Merge commit '928f6516c16ff91c0a789d0a653551041d1bafd0' 2017-04-28 07:07:24 -04:00
48f9e526ea implement expand/expandAs in CPU/GPU code 2017-04-28 07:06:25 -04:00
69574a6dc4 implement expand/expandAs in CPU/GPU code 2017-04-28 07:04:08 -04:00
928f6516c1 implement expand/expandAs in CPU/GPU code 2017-04-28 07:03:51 -04:00
b93b525a1c Enable specifying of margin in HingeEmbeddingLoss (#1378)
Previously it was not possible to set a value for the margin in the HingeEmbeddingLoss in the constructor. This patch fixes the issue and makes the loss behave as it is described in the docs. 

A discussion of this issue can be viewed here:
https://discuss.pytorch.org/t/issue-with-setting-margin-for-hingeembeddingloss/2088
2017-04-28 06:58:48 -04:00
8db2cf6182 temp fix for transposed dilated convolution (#1388) 2017-04-28 02:53:37 +02:00
7e8ef0e22a Actually pass dilation to the underlying operators. (#1386)
No tests for now; we'll need some sort of shape DSL to concisely
represent them.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-27 23:38:01 +02:00
27990fee54 Use fully qualified name as tp_name for tensors and storages (#1379) 2017-04-27 16:26:44 -04:00
2ef7331007 Update sparse.py 2017-04-27 02:25:00 +02:00
c2cfa4cf5b Add THGenerate*Type.h for all types (#1014) 2017-04-27 01:11:56 +02:00
c915f8ddbf Signal error on connection error instead of asserting
Summary: No need to assert on connection errors.

Reviewed By: andrewwdye

Differential Revision: D4957698

fbshipit-source-id: b47f6f0f098dbf7d212701c5cb68e34b2c1c9522
2017-04-26 16:07:13 -07:00
b39a2f2cbb Documentation for sparse tensors. (#1366) 2017-04-26 21:43:05 +02:00
d9f01397b3 s/NOCUDA/NO_CUDA/
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-26 21:42:09 +02:00
8ca7bf2ab3 Check argument types in 'checkTypes' (#1363)
Fixes #1357
2017-04-26 15:00:41 -04:00
8950f41da3 Install CUDA headers.
Summary:
This PR makes cmake install the gloo CUDA headers if USE_CUDA is enabled.
Closes https://github.com/facebookincubator/gloo/pull/29

Differential Revision: D4946856

Pulled By: pietern

fbshipit-source-id: a688c3794c4a5e34b664e7bdeb4e1148f6504419
2017-04-25 22:42:12 -07:00
afd01164f8 Install missing headers.
Summary:
This PR installs missing include headers.
Closes https://github.com/facebookincubator/gloo/pull/30

Differential Revision: D4946478

Pulled By: pietern

fbshipit-source-id: da2d532afc43cf9e5e7fc764dc7821e2dfca6b37
2017-04-25 09:42:21 -07:00
a123247240 Move SIGPIPE initializer to test main
Summary:
It should be up to the program including Gloo to ignore SIGPIPE.
We have seen a case where the EPIPE errno is not properly handled in
an unrelated piece of code. Having SIGPIPE fire means we can get a
core and debug this further.

Reviewed By: andrewwdye

Differential Revision: D4896727

fbshipit-source-id: f6fe2d3f8dc68a9e6c2c457639b45f8aee2d7b20
2017-04-25 09:08:27 -07:00
41705ce7d5 Add zero padding module (#1326) 2017-04-25 16:58:51 +02:00
88fc1d39ff Generic TopK implementation (#744)
* move TopK to generic

* partial genericization of kernel code

* introduce TopKTypeConfig, specialize radix type and conversion for floats

* implement topk for byte tensor

* implement for char tensor

* implement for int tensor, extend test to check indices as well

* works for longs too

* make bitfield set/get a struct, add support for 64-bit types

* extend to double tensor

* implement for half tensor

* asserts; test fix
2017-04-25 16:39:20 +02:00
9899512401 Remove common.h from root
Summary: This file was left over after a recent refactoring but is not used.

Reviewed By: andrewwdye

Differential Revision: D4940265

fbshipit-source-id: 01f8c5fbc73dd0ca0a92306dbfef22ff28133750
2017-04-24 13:51:15 -07:00
d95feb3feb Only build on 64-bit systems
Summary:
While it is theoretically possible to make Gloo work on 32-bit systems, it's unlikely anybody would ever use it on 32-bit systems. This removes the expectation that it should work...

Fixes #28
Closes https://github.com/facebookincubator/gloo/pull/31

Differential Revision: D4939073

Pulled By: pietern

fbshipit-source-id: 8c60804f7ae5cf835332871a424aefa2c498e8a4
2017-04-24 10:38:45 -07:00
3ab074b3c5 Fix torch.stack() with Variable inputs (#1345) 2017-04-24 12:20:51 -04:00
6a69f7007b Revert "add keyword out for autograd function Concat to match torch.cat (#1336)" (#1340)
This reverts commit 71b9dea6ecc2278511ba6c2531437d27d9a2b8c8.
2017-04-23 19:19:27 +02:00
71b9dea6ec add keyword out for autograd function Concat to match torch.cat (#1336) 2017-04-23 15:36:24 +02:00
fa4f363b93 Instance norm (#1283)
* instance norm

* fix whitespaces

* whitespaces

* docs

* "C" letter was cyrillic in docs, fixed

* remove force_eval, fix non contiguous case
2017-04-23 14:49:15 +02:00
aab30d4ea2 Fix errors when no CUDA devices are available (#1334)
Fixes #1267

This fixes a number of issues when PyTorch was compiled with CUDA
support but run on a machine without any GPUs. Now, we treat all errors
from cudaGetDeviceCount() as if the machine has no devices.
2017-04-23 14:45:27 +02:00
2b56711c24 Indexing fix for fused GRU/LSTM kernels when all tensors are not contiguous. (#1325) 2017-04-22 04:22:32 -04:00
2fa3365f94 Merge commit '5224fc56b03b6468cb85ccf39034b8ab0d76d04e' 2017-04-22 01:14:34 -07:00
5224fc56b0 fix typo 2017-04-22 10:14:09 +02:00
4373580e6b Merge commit 'e80a3a7f7b8d0e179c1481e0744f08e9385b31f3' 2017-04-22 01:11:10 -07:00
d9406a8a1a Merge commit '10387a3f35573462e18219c321ff550757ce9b09' 2017-04-22 01:10:53 -07:00
e80a3a7f7b Indexing fix for fused GRU/LSTM kernels when all tensors are not contiguous. 2017-04-22 01:09:46 -07:00
5b83fe6781 add contiguous checks 2017-04-22 09:57:36 +02:00
24d92b5d9f Concatenate directly into shared memory when constructing batches (#1323)
This saves an extra memory copy, which speeds up data loading a bit
(5-10% with accimage).

As part of this change:

 * torch.cat accepts keyword argument out
 * specifying out=None is treated like not specifying out
2017-04-22 03:40:30 -04:00
1375694853 Document torchvision members 2017-04-21 12:50:36 -07:00
be5e399d46 Add a simple README for torch/lib. (#1322)
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-21 15:06:12 -04:00
10387a3f35 fix gradBias checks 2017-04-20 19:21:50 -04:00
a782a6231f Merge commit 'e788ea40de0f7ef393f1b602098a6775a95d8976' 2017-04-20 19:00:45 -04:00
e788ea40de fix typo in TH_APPLY for _dimOffset 2017-04-20 18:59:12 -04:00
6089900011 grammar/typo: "There's 3" -> "There are three"
Summary: Closes https://github.com/facebookincubator/gloo/pull/27

Differential Revision: D4919746

Pulled By: pietern

fbshipit-source-id: 35733b75fc169d2ccff8b10df013eed8c279dfd5
2017-04-20 15:19:56 -07:00
81345306c8 Merge commit '8236d38e81396ac48697ac289c0476cff18a8e08' 2017-04-20 15:03:48 -07:00
f0a19e2617 Merge commit '331219c5506b26bf0906b7acdafb4823e07a924e' 2017-04-20 15:01:22 -07:00
8236d38e81 add cusparse link dependency 2017-04-20 14:31:30 -07:00
8adf8fe2ed create and expose handles for cusparse 2017-04-20 14:30:14 -07:00
d2472d1ab5 Disable cudnn dilated convolutions for kepler. (#1308) 2017-04-20 15:31:45 -04:00
331219c550 define abs for short too 2017-04-20 09:55:17 -07:00
7805ac9098 Base Store::wait() should ignore timeout for back compat
Summary: PrefixStore::wait() uses a default timeout if unspecified. This is incompatible when using PrefixStore to wrap a Store implementation that does not support timeout. Instead the base Store::wait(keys, timeout) implementation is called, throwing an exception. This change modifies the base implementation to ignore the timeout.

Differential Revision: D4916517

fbshipit-source-id: 3cdd83bd209bf938b58442d82f3fc245e68019ad
2017-04-19 16:49:44 -07:00
5f65ee9ca0 Add more newContiguous calls and checks 2017-04-19 14:01:31 -07:00
f9149b1f2e Fix halving-doubling corner cases
Summary: Fixes for corner cases with small element counts. Fixed problems include (1) calling range on out of bounds pointers, (2) failing to allocate send or receive buffers in cases where they correspond to out of bounds indices for reduce-scatter, but are needed in the allgather, (3) not allocating enough receive buffer space (more than count_ bytes may be needed in some cases)

Reviewed By: pietern

Differential Revision: D4912656

fbshipit-source-id: 0409d01894ff9c93ef1a1fdf8021c9ecf62f9b57
2017-04-19 12:20:28 -07:00
a8e6610e3d Fix argument typo in pad_packed_sequence docstring (#1300) 2017-04-19 13:50:59 -04:00
56cc1e219b Fix include in mpi/context.cc
Summary:
memcpy comes from cstring

See https://github.com/caffe2/caffe2/issues/286

Reviewed By: Yangqing

Differential Revision: D4914228

fbshipit-source-id: de60c2a98feb4228546a8f1fe237a090101f50e4
2017-04-19 10:19:55 -07:00
1607042bf4 Add timeout parameter and default to rendezvous Store::wait()
Summary: TSIA. Defaulting to 30s.

Reviewed By: pietern

Differential Revision: D4909202

fbshipit-source-id: 7f86f390077a19e559c90a1aa3aa768e273325d1
2017-04-19 10:11:56 -07:00
7d023cda6c Add timeout to RedisStore::wait()
Summary: Add a default 60s timeout to RedisStore::wait() to avoid blocking indefinitely when peer machines are unavailable.

Reviewed By: pietern

Differential Revision: D4908699

fbshipit-source-id: 39de9066633e8b0c8d1ee198b6bf3f70d3961196
2017-04-19 09:58:05 -07:00
9e8b4ef075 Include THCNumerics.cuh in THCAtomics.cuh. (#752) 2017-04-19 12:08:22 -04:00
a35f507532 Update functional.py (#1298) 2017-04-19 11:07:12 -04:00
6aa22beb86 Fix loss.py docs (#1296) 2017-04-19 11:03:15 -04:00
71bf8fb55b Clean up fd from destructor when in listening state
Summary:
It's possible the pair is in the listening state when it is
destructed. The fd will not have been cleaned up in that case, so we
shouldn't assert that being the case.

Reviewed By: andrewwdye

Differential Revision: D4909964

fbshipit-source-id: 7103d74910e3bcf5de9f4658d8f1f682b6c8a70c
2017-04-18 17:51:49 -07:00
c7d83a16f6 Update README.md 2017-04-18 19:05:18 -04:00
934816c01c Change the default algo for cuDNN conv forward to PRECOMP_GEMM (#1290) 2017-04-18 19:01:47 -04:00
5a0510934f Merge commit 'fcf4deac7d215f134ea25cd3def8b564b58b033c' 2017-04-18 15:21:20 -07:00
fc19473501 Corrections in legacy modules. (#1286) 2017-04-18 17:13:53 -04:00
34546f022a Expose dilated convolutions.
Fixes #1225.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-18 17:13:02 -04:00
ab77742f6e Add some missing documentation for arguments.
Signed-off-by: Edward Z. Yang <ezyang@fb.com>
2017-04-18 17:13:02 -04:00
701e63107f speed improvements, fix tests 2017-04-18 12:46:54 -07:00
655c22569e CPU hspmm + more efficient reorder 2017-04-18 12:46:54 -07:00
cd3bbc9dfd more operations and optimizations (hspmm, reorder, ...) 2017-04-18 12:46:54 -07:00
1018b238ac make gradients contiguous in adagrad 2017-04-18 12:46:54 -07:00
e27bd4ce7a faster cadd 2017-04-18 12:46:54 -07:00
b2acc33c73 contiguousValues method 2017-04-18 12:46:54 -07:00
40804830b8 mark_contiguous operation 2017-04-18 12:46:54 -07:00
01d84c5f9d revert sparse cuda index type change 2017-04-18 12:46:54 -07:00
88b42324e7 spcadd, sparseMask, cadd, csub, cmul + tests 2017-04-18 12:46:54 -07:00
ec260fe8e9 add test for dsmm 2017-04-18 12:46:54 -07:00
328b416068 THCS contiguous + to_dense 2017-04-18 12:46:54 -07:00
4bde9efbd7 Update CONTRIBUTING.md 2017-04-18 15:39:58 -04:00
ff781ed059 Update CONTRIBUTING.md 2017-04-18 15:39:26 -04:00
8f9a1af253 Merge commit 'fcf4deac7d215f134ea25cd3def8b564b58b033c' 2017-04-18 12:22:44 -07:00
31900b6bae Merge commit '1feb120d938d47c01900f656322f16bc41d08af3' 2017-04-18 12:22:27 -07:00
46cf6ff5fb fix batchnorm docs (#1284) 2017-04-18 15:12:38 -04:00
fcf4deac7d Fused RNN kernel remove explicit instantiation, isn't needed. 2017-04-18 11:07:58 -07:00
1feb120d93 Mark input as optional for gradInput in Tanh and Sigmoid 2017-04-18 10:33:33 -07:00
2ca071d730 Remove double precision math from LogSigmoid too 2017-04-18 10:28:13 -07:00
8a901c510d Update ops for Sigmoid and Tanh 2017-04-18 09:55:11 -07:00
ed60fe0ed6 Gloo benchmarking and script updates
Summary: Add AllgatherRing and CudaBroadcastOneToAll to benchmark. Add host info and algorithm sweep to chronos script.

Reviewed By: pietern

Differential Revision: D4901111

fbshipit-source-id: 1421025d39b914b14e857f21c43eac30c9c9dd2f
2017-04-18 09:06:34 -07:00
f67ab32d34 Output peer address on network failures
Summary: Output peer address on network failures. This change will help in root causing network failures.

Differential Revision: D4899129

fbshipit-source-id: 60a762c6551a726081d5335ab478da8dd7f6dad7
2017-04-17 13:50:24 -07:00
9150e33765 Add support for creating docsets. (#1276)
Docsets are an offline documentation format introduced by Dash.app and
supported by Zeal and some other open-source clones.
2017-04-17 16:35:02 -04:00
e4478804ce Fix patched_make_field for newer Sphinx versions. (#1275)
Not sure since which version that change is needed, but using v1.5.5 here.
2017-04-17 16:17:58 -04:00
a220f2c3aa Fix group-convolution w/o biases on CPU. (#1273)
* Fix group-convolution w/o biases on CPU.

Not having this guard will cause a crash further down in the `cat`
function when it uses the first element in the passed list to create a
new tensor. (And even after that, cat doesn't handle nulls well.)

* Added test for groupconv w/o bias on CPU.
2017-04-17 14:53:28 -04:00
15267ac009 fix typo 2017-04-15 13:08:58 -04:00
0cb60e7d5a Retrieve ethernet interface link speed
Summary: Retrieve ethernet interface link speed

Reviewed By: pietern

Differential Revision: D4880290

fbshipit-source-id: 91f1555d9bb35ff41dc731e082365a9002bb1661
2017-04-14 14:41:01 -07:00
b61174047f Add threshold to switch between host/device reduce and bcast depending on buffer size
Summary: Device reduce is more efficient for large buffer sizes. For smaller buffers, host reduce may be more efficient in some cases and frees up the GPU for other work.

Reviewed By: andrewwdye

Differential Revision: D4885855

fbshipit-source-id: 7dc522e8c93e1a94427730aca6af03b7e93e660d
2017-04-13 15:05:47 -07:00
8d93fcf13f Don't allow overwriting keys in HashStore
Summary: TSIA

Reviewed By: andrewwdye

Differential Revision: D4885102

fbshipit-source-id: c46c180fa8e6dd354921d562830b3515ba91c964
2017-04-13 12:35:32 -07:00
a559893c9f Instantiate nccl type templates for gloo (minus half)
Summary:
Instantiate nccl type templates for gloo (minus half).
half requires at a minimum ifdefing CUDA_HAS_HALF and likely requires
more work given that operators aren't defined on it, so skipping it
for now.

Reviewed By: pietern

Differential Revision: D4876217

fbshipit-source-id: 833d2aec12789cbaf9e0a201b979a420fbe6732f
2017-04-13 10:52:38 -07:00
50c2759afe Expose missing headers
Summary: Closes https://github.com/facebookincubator/gloo/pull/25

Differential Revision: D4883908

Pulled By: pietern

fbshipit-source-id: 662a8fdf83ad099295b11043194de25c747e8286
2017-04-13 10:08:06 -07:00
cb66e9cf78 torch.diag bug fix (#1251) 2017-04-12 20:59:12 -07:00
735f5af87e Add new variant of halving/doubling algorithm that pipelines local reduce/broadcast with communication steps
Summary: Added a pipelined version of cuda halving/doubling algorithm. Half the buffer is reduced prior to first send and the other half prior to reducing the result from first receive. Broadcasts are started asynchronously as soon as each new message is received. New code was added as a new algorithm, as pipelining makes performance worse for small buffer sizes.

Reviewed By: pietern

Differential Revision: D4847109

fbshipit-source-id: 5aa55de95f8c94069380af7396f2b5b6297dcbea
2017-04-12 18:01:22 -07:00
c852883086 add named_parameters that yield name and value of parameters (#1242) 2017-04-12 16:32:36 -07:00
ab77e4c3d7 Merge commit '62c584ba7972dbba404766aa06d1a558282b4169' 2017-04-12 15:06:58 -07:00
2444278b8b Merge commit '4336e9ea6641b8ac2814eaef2adef64e4106459c' 2017-04-12 15:06:10 -07:00
62c584ba79 Fix abs with char and short cuda types. (#747) 2017-04-12 15:04:59 -07:00
fbd53d87bf block wide reduction with multiple values to reduce at once (#745) 2017-04-12 15:04:43 -07:00
71303b8af4 Autograd deadlock for recent glibc fix (#1243) 2017-04-12 22:24:31 +02:00
4336e9ea66 Revert "make it compile on Windows + use ilp64 MKL" (#1002) 2017-04-12 12:07:16 -07:00
d48afd41f9 Add print string for MaxPool3d, change for MaxPool2d (#1115) 2017-04-12 15:58:28 +02:00
e21e4bf3e8 add pyyaml to conda note here as well 2017-04-11 21:21:18 -07:00
8e36339911 Merge commit '0925c91e80cc1b3a86fcbc54570f5bb204c9cb77' 2017-04-11 18:00:44 -07:00
5391fe8953 addr zeroes output buffer when beta=0 2017-04-11 18:00:11 -07:00
0925c91e80 addr zeroes output buffer when beta=0 2017-04-11 17:59:42 -07:00
253c854da5 update Dockerfile not to use requirements.txt 2017-04-11 15:42:05 -07:00
7c59754d24 update source build instructions 2017-04-11 15:24:31 -07:00
2bf7dc643f Merge commit 'aec658f8708a6f4448329da006d14ff2e13dc821' 2017-04-11 15:02:36 -07:00
ce30c76823 Merge commit '2b37ecfccf810a8e21c2c9ac9a943ce2f7c01015' 2017-04-11 15:02:16 -07:00
a8d60ad3ac fix THNN headers 2017-04-11 15:00:30 -07:00
aec658f870 fix THNN headers 2017-04-11 14:57:11 -07:00
2b37ecfccf fix THNN headers 2017-04-11 14:56:53 -07:00
01a35dcace Fix coalesced CUDA collectives for nonhomogeneous lists 2017-04-11 14:48:54 -07:00
afeeb81e79 Add support for keyword arguments in torch.cat 2017-04-11 14:48:54 -07:00
6002f94232 Fix is_tensor and is_storage for old-style classes 2017-04-11 14:48:54 -07:00
a5c7d98611 Import TripletMarginLoss 2017-04-11 14:48:54 -07:00
605b3c86ce Retain the type of numpy scalars in collate_fn 2017-04-11 14:48:54 -07:00
2087b1157a Improve serialization error messages 2017-04-11 14:48:54 -07:00
81e972031d Handle all errors if Module's sources can't be retrieved 2017-04-11 14:48:54 -07:00
e9ff57176b Fused pointwise kernels for GRU/LSTM 2017-04-11 13:42:06 -07:00
a739960515 Merge commit 'cfa504691c2ce5e10010ffb6cd43001c59109aea' 2017-04-11 13:41:54 -07:00
f43320dbf2 Merge commit '0dc52abe9a673547caf79ac64c73e8e16fb37b33' 2017-04-11 13:41:42 -07:00
cfa504691c Fused pointwise kernels for GRU/LSTM 2017-04-11 13:36:38 -07:00
0dc52abe9a Fused pointwise kernels for GRU/LSTM 2017-04-11 13:36:02 -07:00
0b50f794e9 Use thnn version of Tanh/Sigmoid instead of autograd. (#1234) 2017-04-11 12:49:57 -07:00
2abbb5133c Fixing function signatures: long -> ptrdiff_t (#1232) 2017-04-11 11:37:21 -07:00
fcf8387779 Fix ibv_devices wrapper if device list is empty
Summary: TSIA

Reviewed By: andrewwdye

Differential Revision: D4866469

fbshipit-source-id: 6bbde8ec9d71ea89ccdab379d48d122b90237460
2017-04-11 11:04:54 -07:00
ade105fb7c update README to install pyyaml from conda (#1231) 2017-04-11 10:23:45 -07:00
4e693d12ab Merge commit '79c4cb96b16dac603247ffd88c473e84565915a9' 2017-04-10 14:35:54 -07:00
79c4cb96b1 fix memory leak in btrisolve and getri 2017-04-10 14:35:07 -07:00
97bd6aae37 Throw error if Redis replies with error
Summary:
The code already asserted, but only on the reply type, so it didn't
include the actual error message. This makes debugging problems much
easier when people have problems running the benchmark suite.

Differential Revision: D4860022

fbshipit-source-id: 659bc461a724603375bff18eac90eca658492b05
2017-04-10 10:49:59 -07:00
f618ea9f31 Update README.md
Summary:
Mention GPUDirect in README
Closes https://github.com/facebookincubator/gloo/pull/24

Differential Revision: D4860167

Pulled By: pietern

fbshipit-source-id: 80804c778cdc6a9bcd8febe7e05142145cc6c61b
2017-04-10 10:49:59 -07:00
f6fef3718e fix typo in autograd.rst (#1219) 2017-04-10 01:16:59 -04:00
3fcdd6a42b Reuse sockaddr information from device
Summary: This is cheaper than doing getaddrinfo for every pair.

Reviewed By: andrewwdye

Differential Revision: D4850102

fbshipit-source-id: e77f468f099f63860b52fdd0dcc57a8a7a91a448
2017-04-09 16:37:41 -07:00
707c1ca4cc Function to retrieve PCI bus ID from device
Summary:
Part of this change is to perform a getaddrinfo in the TCP device
class so we can figure out the interface and subsequently PCI bus ID
of the NIC used for its traffic. This information can be used in a
later diff to avoid doing getaddrinfo calls in the TCP pairs and have
them reuse the information that is resolved by the device.

The PCI bus ID can be used to compute distance between NICs and GPUs
and make informed decisions on where to allocate scratch buffers.

Reviewed By: andrewwdye

Differential Revision: D4850035

fbshipit-source-id: 575e401a9273300bc720c814fef8971846ec748c
2017-04-09 16:37:41 -07:00
bc0ed9298d remove incorrect version in readme 2017-04-09 14:44:44 -04:00
040cf42643 Merge pull request #455 from twitter-forks/indexlinear
Adding Indexlinear
2017-04-09 13:52:56 -04:00
6d9ad1d66a Adding IndexLinear (#1181)
* Add IndexLinear

* Fixes to IndexLinear

- Fix IndexLinear test
- make it better for multithreaded case
- fix a glitch in the C code
- improve the reset() method
- fix the weight allocation.
- remove "fakeBatch" possibility as it's not used
- clamp normalized values at evaluation time instead of just dividing by max.
- add assert on the keys/values dimensions in IndexLinear.
- invert order of weightDecay in the case of output dim > 1.

* Changes required to support IndexLinear in CUDA

* Adding support for flattened inputs for IndexLinear

* Doc for IndexLinear + fix for when the input format changes from one batch to another.

* Cleaning up IndexLinear documentation

* Changes required to build with latest torch

* Adding benchmark script for IndexLinear

* Bugfixes and cleanup of IndexLinear.lua

- Fixed bug that occurs when performing multiple accGradParams +
  updateParams

- All the data required for the updates is put in a single table

- Added :parameters method
2017-04-09 13:51:45 -04:00
64ee4056d7 updated docker image inside the docs (#1216) 2017-04-08 10:29:03 -04:00
55d69b5ade Merge commit '88bcfc15316e3c878237a8f95aeb6e72402c90ff' 2017-04-07 17:20:52 -07:00
0d7d6e1f0d Merge commit '662163bef68a9d64f3cb13a903638c870c0b4aa6' 2017-04-07 17:20:15 -07:00
b16a352a3b Fix remainder and cremainder for integer types 2017-04-07 17:17:44 -07:00
88bcfc1531 Fix remainder and cremainder for integer types 2017-04-07 17:16:59 -07:00
662163bef6 Fix remainder and cremainder for integer types 2017-04-07 17:16:31 -07:00
4026593240 check for beta=0 and avoid multiply in sparse mm (#1211)
* check for beta=0 and avoid multiply in sparse mm
2017-04-07 20:14:32 -04:00
a931064a52 Merge commit '441d75ce569f89bad3e2f1f2a2075e68ae3bc76b' 2017-04-07 16:57:05 -07:00
441d75ce56 Adapts basic operations to new THXVector interface 2017-04-07 16:56:12 -07:00
3de56785fa fix conv1d test and add for padding 2017-04-07 13:56:02 -07:00
5ee8536a02 Merge commit 'a89317a9d407241c97fe4486b3c88de8578445d7' 2017-04-07 13:49:18 -07:00
f00a5d2f54 Merge commit '66a20e5c328836c1eb720cf4e2eb916366aae487' 2017-04-07 13:47:25 -07:00
a89317a9d4 fix types in unfold.c 2017-04-07 13:32:04 -07:00
e48db02e10 remove unused python-level BatchNorm.py 2017-04-07 16:27:16 -04:00
7f2553bc6f dont use cudnn batchnorm for cudnn < 5.1.10 2017-04-07 16:27:16 -04:00
66a20e5c32 Support TORCH_NVCC_FLAGS environment variable
This is already supported in cutorch since august 2016, and is used in
pytorch integration (to reduce the binary size).
2017-04-07 18:23:22 +02:00
37d95687c4 Merge commit 'ae1c365dbdbf667ae24c57eec9f2e6b9debf16bd' 2017-04-06 16:37:31 -07:00
f0c7124420 Allow support for negative dimension argument for all functions 2017-04-06 16:37:00 -07:00
ae1c365dbd Add TH_INDEX_BASE to nDimension and stride functions 2017-04-06 16:30:11 -07:00
6fd9b53d93 Include common/linux.{h,cc} in CMake build
Summary:
Forgot to include these in a previous commit.
Closes https://github.com/facebookincubator/gloo/pull/23

Differential Revision: D4847072

Pulled By: pietern

fbshipit-source-id: 08aa9e8fa47377eb8c7747bd577eec7e615789f1
2017-04-06 15:20:59 -07:00
e692c38fcf Compute distance metric between PCI devices
Summary:
With this we can compute the best GPU device to reduce on. It is not
always the one CUDA indicates as GPU 0.

Reviewed By: andrewwdye

Differential Revision: D4845581

fbshipit-source-id: 13e0500f54fd507899646f781a97c09abcd3b056
2017-04-06 13:50:07 -07:00
5dfa73702f Display runtime information in benchmark output
Summary:
This makes it easier to capture, compare, contrast results with
different parameters.

Reviewed By: andrewwdye

Differential Revision: D4843715

fbshipit-source-id: ba6916dcd5f8bcc615d6edce1a54657241357c31
2017-04-06 11:06:23 -07:00
95140094cb Use CudaStream as first class object
Summary:
Instead of having every CudaDevicePointer "own" a stream, this change
moves to using CudaStream as first class object. It was pretty clunky
to use the copy{To,From}* functions on the CUDA pointer classes to
copy stuff around. For example it was not clear whether the stream
belonging to the source or destination was used to execute the copy
on. There is no longer such ambiguity after this change.

To make this work the CudaBroadcastOneToAll algorithm was changed to
include the workspace template argument, but only has the
CudaHostWorkspace implementation. The CudaDeviceWorkspace
implementation is left to be done for another change (that's not the
purpose of this change).

Reviewed By: andrewwdye

Differential Revision: D4841615

fbshipit-source-id: d0c1b9ba948ff6167832515afa7bdd2b32b48064
2017-04-06 11:06:23 -07:00
ef95926103 Move setTimeout to Device and set default tcp timeout to 30 sec
Summary: Make timeout a device attribute. Now the pair will configure timeout when connecting based on device timeout settings, instead of needing to be set explicitly on each pair. Set default tcp timeout to 30 sec.

Reviewed By: pietern

Differential Revision: D4838918

fbshipit-source-id: e6e6ee36c662eb5e7ba5354c904e50f9dcac258f
2017-04-06 08:50:21 -07:00
e7f5220dfa device_ids can be None again in data_parallel (#1187) 2017-04-06 10:30:53 -04:00
a7ae04a657 fix precedence problem when building with debug python (#1201) 2017-04-06 10:30:16 -04:00
7f03182bfa sizeAverage -> size_average in docs 2017-04-06 01:31:02 -04:00
9f2a5d804d Add a flag to fix when dataset size is not divisible by batch size. (#1133) 2017-04-06 00:18:43 -04:00
aa506fa4d7 fix docs typo 2017-04-05 23:42:02 -04:00
955869a09a fix cuda_allreduce_halving_doubling to correctly copy between and reduce on GPU buffers
Summary: cuda_allreduce_halving_doubling was not properly handling the case where buffers are allocated in GPU memory, trying to reduce and copy from them as if they were in system memory.

Reviewed By: pietern

Differential Revision: D4840259

fbshipit-source-id: 2615360cd2f1d9c7a37fb0bcdf33ff35528b2c75
2017-04-05 19:56:20 -07:00
d82cad3019 implement nn.Module.__dir__ (#1142) 2017-04-05 22:18:34 -04:00
9504246c32 add triplet margin loss (#1165) 2017-04-05 22:17:58 -04:00
81cf3dbf79 Merge commit '6bd4ecd15390517c68d598d236ffb0929ade277c' 2017-04-05 19:07:01 -07:00
12f1b4f76c Merge commit '84bdbe5ab4b602b021ff494487c8ad57457052d3' 2017-04-05 19:06:14 -07:00
84bdbe5ab4 btrisolve: Add sz checks, correct B's ordering, support nrhs>1. 2017-04-05 19:05:20 -07:00
85954032d9 fix doc formatting 2017-04-05 22:02:29 -04:00
1a04b92226 add note regarding SGD momentum 2017-04-05 20:45:41 -04:00
8a822d48f5 Update README.md
Summary:
Clarify that Redis Cluster is not supported. Also see #21.
Closes https://github.com/facebookincubator/gloo/pull/22

Differential Revision: D4837375

Pulled By: pietern

fbshipit-source-id: 6e3575b3b8dae6ca62beb765da15d8506da4abdb
2017-04-05 13:06:48 -07:00
5511ad258b cuda version of recursive halving/doubling allreduce
Summary: Basic port of the CPU halving/doubling algorithm. No pipelining is done between reduce/broadcast and communication.

Reviewed By: pietern

Differential Revision: D4823693

fbshipit-source-id: b18045d64edf90361bf7713f4ccb2e074757780f
2017-04-05 12:39:16 -07:00
75a635630d Update to ignore zero targets
If the target is zero, loss and gradient of input are set to zero. It
is useful for variable-length natural language generation models.
2017-04-05 11:51:54 -07:00
8e6524938b Undo D4832492 for Gloo
Summary: No folly dependency in Gloo.

Reviewed By: andrewwdye

Differential Revision: D4835050

fbshipit-source-id: 97d0c14fb770fdde68206ca5a20a974bef156392
2017-04-05 09:51:05 -07:00
4e4cfd8b2b Fix main()s to call folly::init/initFacebook/registrationComplete (part 14)
Summary:
Required for D4821763
Based on targets from https://fb.facebook.com/groups/fbcode/permalink/1304073246296178/ (I also excluded those targets which do not depend on folly:singleton).

Reviewed By: meyering

Differential Revision: D4832492

fbshipit-source-id: fcb4ce42e9e5359d4752769f77d7271e550201fe
2017-04-04 20:50:47 -07:00
6bd4ecd153 Use thrust::inclusive_scan for 1D cumsum/cumprod (#742)
For large 1D tensors thrust::inclusive_scan is much faster than our
current implementation.
2017-04-04 21:05:10 -04:00
5c802c5ba9 Refactor AllgatherRing to use remote buffer offset
Summary: Refactor AllgatherRing algorithm to remove all memcpy in the communication rounds by using outPtrs as send/receive buffer + remote buffer offset.

Reviewed By: pietern

Differential Revision: D4793186

fbshipit-source-id: 645d0758d246fd0b493e3fe312a8441d86f6d169
2017-04-04 17:08:26 -07:00
04f5b5ea83 Merge commit '5b40e4245d573ae0a6c2da70a0b712528aab2bce' 2017-04-04 15:39:35 -07:00
5b40e4245d Fix typo and make btrisolve work for doubles on the CPU. 2017-04-04 18:29:30 -04:00
ae5865082c Move common algorithm stuff into algorithm.h
Summary:
Combines the top level common.h with algorithm.h. With algorithm.h in
the common package, CUDA algorithms only need a dependency on that
package. CudaBroadcastOneToAll still depended on broadcast.h so this
change also removes that dependency and has it subclass the Algorithm
class.

Reviewed By: andrewwdye

Differential Revision: D4826885

fbshipit-source-id: 930037e39f7a2c941868e53f0bbc54e3f2e0b184
2017-04-04 13:05:50 -07:00
f86beccc5b Use workspace pattern with CudaAllreduceRingChunked
Summary:
GPUDirect support for CudaAllreduceRingChunked by adding a workspace
template parameter and adding workspace specific init functions.

To support this change the CUDA LocalOp classes had to be changed a
bit to take an extra destination/source pointer. This allows reduction
of 1-N pointers into a target pointer, where the target may live on
device or live on host. If it lives on the host, the NCCL operation
that executes the reduction is followed by a D-to-H memory copy. If
there is only a single input pointer, no reduction needs to happen and
the class just executes the D-to-H memory copy. The net result is that
we can interchangeably use device or host pointers as target for
reduction or source for broadcast and these LocalOps do what you would
expect them to do.

Reviewed By: andrewwdye

Differential Revision: D4825236

fbshipit-source-id: 048ec6cbc5a0500bafbe1b3f6abe1e2e5f3a2675
2017-04-04 13:05:50 -07:00
d122b4e4ec Update btrisolve docs to the newest interface. 2017-04-04 15:21:16 -04:00
ccfc4567dc Merge pull request #78 from ilya-biryukov/master
Fix compilation error when compiling with 'clang -x cuda'.
2017-04-04 09:47:52 -07:00
81008aa111 Handle errors in sync IO path.
Summary: Fixes for handling errors and timeouts in blocking and polling sync paths. Add test coverage for errors and timeouts.

Reviewed By: pietern

Differential Revision: D4823498

fbshipit-source-id: 93721947a6404ca9cea6a4869f4156f8d270a981
2017-04-04 09:37:33 -07:00
0cdf10478d Start benchmark element sweep at 100
Summary:
Any number of elements below this always fits in a single packet
and will yield ~identical results.

Differential Revision: D4825190

fbshipit-source-id: 71ac77456049e991da5059d5a029c5e9d2a67ed7
2017-04-03 23:50:38 -07:00
4de82cfa0f Use CudaAllreduceRing<CudaDeviceWorkspace> for GPUDirect
Summary:
The existing CudaAllreduceRing with a CudaDeviceWorkspace
template parameter now has the same effect.

Reviewed By: andrewwdye

Differential Revision: D4823393

fbshipit-source-id: 88fe497a983b26a281a3a74fe3bdc02c0c87c523
2017-04-03 20:05:25 -07:00
1ac8251373 Use gloo::make_unique to fix build for C++11
Summary: Closes https://github.com/facebookincubator/gloo/pull/20

Differential Revision: D4820325

Pulled By: pietern

fbshipit-source-id: 00a870f71e8e98ce6d06da261dcaed83b81ec81c
2017-04-03 17:07:04 -07:00
511ca3ea1b Add tests for tcp transport failures
Summary:
Implement a file store for multi-process transport failure testing. Add test cases to spawn multi-process tcp communication, and verify that all processes throw the expected IoException.

A future diff will add coverage for connectivity failures, sync modes, and ibverbs.

Reviewed By: pietern

Differential Revision: D4807794

fbshipit-source-id: 35212719d46e6d875eacb341fae25681f39053bc
2017-04-03 16:08:39 -07:00
8ce1382e99 make it compile on Windows + use ilp64 MKL (#981) 2017-04-03 18:02:15 -04:00
22cdef3ddc recursive halving/doubling allreduce
Summary:
Allreduce using recursive halving and doubling algorithm. Algorithm is described in http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf (see top diagram on page 12). Algorithm consists of 2 lg P stages, the first log P performing a reduce-scatter and the second log P the allgather. Message size is variable across steps. The early stages of the reduce-scatter and the late stages of allgather send the largest messages. The communication is structured such that the largest messages are sent between nearby ranks, which could be useful if elements are ranked in locality-aware fashion.

So far this supports only power-of-two number of processing elements.

I have attempted to minimize the amount of synchronization/ hand-shaking. Messages are received at different offsets of the output buffer for each communication step. Send offsets in the reduce-scatter steps become receive offsets in the allgather and vice versa. The reuse of buffers across reduce-scatter and allgather steps requires synchronization. Right now the algorithm is inefficient in terms of memory use, requiring 3x memory currently. This can be reduced, but would require additional synchronization.

Reviewed By: pietern

Differential Revision: D4795878

fbshipit-source-id: fcc6597ef6a99cd102fce2b8e4562d93088d39dc
2017-04-03 14:05:44 -07:00
148b11847b Remove useless base class in allreduce.h
Summary:
Didn't provide enough value now that ReductionFunction and
CudaReductionFunction are no longer related.

Reviewed By: andrewwdye

Differential Revision: D4819295

fbshipit-source-id: e6479769af7f78d486bee7d9c31f049430cdc775
2017-04-03 11:09:50 -07:00
b3a2f30715 Extra workspace template parameter for CUDA algorithm
Summary:
To bring the GPUDirect and non-GPUDirect implementations of CUDA aware
algorithms closer together this change introduces CUDA workspaces.
There's an implementation for a host side workspace and a device side
workspace. The former is used for transports that don't support
GPUDirect and the latter for ones that do. CUDA algorithms will take
an extra template parameter for this workspace and this will determine
whether they can be used for GPUDirect or not.

The workspaces only define their respective pointer types right now
but may contain local operation construction functions at a later
point in time.

Reviewed By: andrewwdye

Differential Revision: D4802826

fbshipit-source-id: cb1d71a224ce0165afd07fb9092ad54d3e07c8cf
2017-04-03 11:09:50 -07:00
91c4ba7980 Add torch.arange and deprecate torch.range 2017-04-03 10:38:58 -04:00
03f1cab801 Unify argument names in norm and renorm 2017-04-03 10:38:58 -04:00
fa2c566353 Add Variable.type_as 2017-04-03 10:38:58 -04:00
2d1122739c Raise AttributeError in Module.__getattr__ 2017-04-03 10:38:58 -04:00
7861f585fe Reshape grad in dot 2017-04-03 10:38:58 -04:00
3abf2ef225 Merge pull request #991 from BTNC/win
add /arch:AVX /arch:AVX2 explicitly for msvc so it compiles on windows
2017-04-02 13:32:57 -04:00
70c4b82eba add /arch:AVX /arch:AVX2 explicitly for msvc 2017-04-02 20:47:29 +08:00
274b5c9003 Allow unhashable inputs to parallel_apply 2017-04-01 20:11:20 +02:00
dfa2d26830 * make random_ range correct when both lower and upper are specified 2017-03-31 15:37:24 -04:00
559ae078b8 Fix Option constructor in invalid argument error printing code (#1160) 2017-03-31 15:35:35 -04:00
030ff4928a Merge commit 'a216e377b3844ac9c7882bd391a00f4e0ae718e7' 2017-03-31 11:45:37 -07:00
0829bffdec Merge commit '403cad46dc91a2bc2f6889754055decd6f3d53c7' 2017-03-31 11:45:24 -07:00
ffc7911bec Merge commit 'd8ae7893e056ebf4e7a5e96bab2c3b69f196ddfd' 2017-03-31 11:45:06 -07:00
ff1fde6151 Merge commit 'a3bfb9f376a57fb63e89ddf70f57353f19ed9d69' 2017-03-31 11:44:48 -07:00
a216e377b3 Merge pull request #456 from twitter-forks/addmm-fixes
Using temporary variables when performing transpose + addmm
2017-03-31 14:44:07 -04:00
b13b7010b9 check for nvidia driver's sufficiency before checking for number of CUDA devices (#1156) 2017-03-31 12:19:59 -04:00
a3bfb9f376 THVector_(add),(mul) -> (adds),(mul) for VSX.
This was previously completed for other architectures.
2017-03-31 08:50:23 -07:00
5c79046d39 Use persistent tensor to store exp_inf (part of optimizer's state) (#1152) 2017-03-31 10:30:31 -04:00
30fd222b80 implement autograd function cross (#1138) 2017-03-31 01:45:51 -04:00
3b7b23df66 Move CUDA collectives to cuda_collectives.h
Summary:
The CUDA algorithms all had their own version of local reduction and
broadcast. This commit consolidates them and allows all CUDA
algorithms to work with CudaDevicePointer instances.

Reviewed By: andrewwdye

Differential Revision: D4797968

fbshipit-source-id: cccef39fce01905a2cd757ccbcffd29803411409
2017-03-30 15:06:03 -07:00
d933287114 Add a barrier after verification iteration in benchmarks to prevent a race with regular iterations
Summary: Verification was sometimes failing for allreduce halving-doubling. Pieter noticed that it is due to verification step racing with the regular iterations.

Reviewed By: pietern

Differential Revision: D4804558

fbshipit-source-id: f645cb2e332e449a993a634c5bdb42c2dcb8613b
2017-03-30 14:14:32 -07:00
761eef1f19 Minor typo fix in backward function in torch/autograd/variable.py (#1143) 2017-03-30 11:23:28 -04:00
d8ae7893e0 Get rid of warp-synchronous code (#739)
Time to get rid of warp-synchronous code. It will break!
2017-03-30 01:20:43 -04:00
90b872c670 Add GPUDirect capable version of CudaAllreduceRing
Summary:
This is a copy of CudaAllreduceRing that doesn't stage the locally
reduced buffer in host memory but uses the GPU side buffers directly.

Eventually I would like this to be absorbed back into
CudaAllreduceRing, but for now it's a good place to compare the two
implementations and abstract the parts that make sense, until they are
identical again.

Reviewed By: andrewwdye

Differential Revision: D4791629

fbshipit-source-id: 5ad065cb94adb968aeee2379327be313638f2161
2017-03-29 18:50:11 -07:00
a95ce9e98f Using temporary variables when performing transpose + addmm 2017-03-29 16:56:39 -07:00
403cad46dc Using temporary variables when performing transpose + addmm 2017-03-29 16:14:13 -07:00
b8ccf42c74 Constify algorithm constructors
Summary: TSIA

Reviewed By: gchanan

Differential Revision: D4795492

fbshipit-source-id: aaad7afd373e40fa4669129cf2c98594c4091153
2017-03-29 14:21:03 -07:00
8aa1cefed8 Fix deadlock in autograd (#1140) 2017-03-29 16:19:40 -04:00
4b147e2079 Settable timeout for tcp read/write
Summary: Add a setTimeout() API to the Pair interface. Implement in the tcp transport for connect, read, and write, and across blocking, polling, and async configurations. Ibverbs implementation to come later.

Reviewed By: pietern

Differential Revision: D4787932

fbshipit-source-id: 6072dc0c0add1700f84a72b83e4388b29b044ec1
2017-03-29 09:07:04 -07:00
0d908d813b Implements Cumsum function for autograd (#1122) 2017-03-29 17:45:57 +02:00
1c391f6f93 bump version 2017-03-29 10:08:34 -04:00
be146fd721 Add btriunpack and update the btrifact test. 2017-03-29 13:42:13 +02:00
2979f4b989 add more functions to docs 2017-03-29 01:29:17 -04:00
22b3600f19 add samplers to documentation 2017-03-29 00:33:07 -04:00
215813d7ac Change dockerfile to support for cudnn v6 (#1135) 2017-03-28 20:05:04 -04:00
80e88a88ed Fix ibverbs completion queue capacity
Summary:
The header already contained an analysis of required completion queue
depth but the queue pair was still initialized with a maximum queue
depth of kMaxBuffers. This change fixes that and updates the analysis
to talk separately about receive and send completion queues.

Reviewed By: andrewwdye

Differential Revision: D4785786

fbshipit-source-id: 4dc302d523a3b7162dc261d14cfcc755681febf8
2017-03-28 10:06:50 -07:00
dc7695a47a Update links for tutorials in README (#1123) 2017-03-28 14:21:40 +02:00
032a65edff modify pip uninstall command in CONTRIBUTING.md 2017-03-28 14:20:49 +02:00
55546359b6 Retry on EINTR for writev in tcp/pair.cc
Summary: TSIA

Differential Revision: D4783319

fbshipit-source-id: 610d1a65a54048e7c56610632ccfe271eac85b6c
2017-03-27 17:35:45 -07:00
fe3d5a63f2 Support multiple predefined reduction functions
Summary:
Predefining the reduction functions makes it easy to provide a set of
fast implementations. Eigen is used to implement them if it is found.

Reviewed By: andrewwdye

Differential Revision: D4780868

fbshipit-source-id: e825cf2e5cfe8ec27d587c5aff4002534b1c670d
2017-03-27 14:35:02 -07:00
e4b4e515cd add mode to cwrap 2017-03-27 13:29:14 -07:00
4b1f5f4bd6 Merge commit 'afd576ec0e389db3e47efe44652c488b1706f168' 2017-03-27 13:26:50 -07:00
37718e207d Add remote offset argument to buffer send
Summary: This makes it possible to write to any offset in a remote buffer.

Reviewed By: andrewwdye

Differential Revision: D4779776

fbshipit-source-id: f5a44cc705df5141bd720ff4e3fec8697f707a70
2017-03-27 13:07:17 -07:00
afd576ec0e Add mode kernel 2017-03-27 15:58:47 -04:00
95aa2af377 btrisolve: Make a Tensor method and update argument order
Also update docs for btrifact and btrisolve to the newest interface.
2017-03-27 15:46:49 -04:00
6774d39c96 Merge commit '5d274cd4991022d63b014cc8917e00c15441d3f4' 2017-03-27 11:54:08 -07:00
567faedc59 Merge commit '8051dec608368fed3569c7513292785083adc53c' 2017-03-27 11:53:41 -07:00
7c2c7e8e31 Move NCCL code to subdirectory and backfill ops
Summary:
All operations supported by NCCL are now available through the Gloo
wrappers. Algorithm wrappers for them are forthcoming so that they
can be used interchangeably with other implementations.

Since not all of them require same-sized source and destination
pointers, I moved assertions on number of elements to the op
constructors.

Reviewed By: andrewwdye

Differential Revision: D4771292

fbshipit-source-id: 2f34629507b5e1cb9ae8d6d2f02de0a7f641a341
2017-03-27 09:50:40 -07:00
3eab8a71e2 Added docstring to add_module (#1116) 2017-03-27 11:09:24 -04:00
2fd4d088ff add Adaptive pooling methods to docs 2017-03-26 22:43:46 -04:00
5d274cd499 Update btrisolve argument order. 2017-03-26 13:07:24 -04:00
8051dec608 Update btrisolve argument order. 2017-03-26 13:06:34 -04:00
f2c1071c33 Adaptive max and average pooling (1D & 2D) (#1084) 2017-03-26 17:09:28 +02:00
bb71117ecc Cwrap arg assign (#1102) 2017-03-26 13:53:28 +02:00
d25433a099 Fix docker build commands (#1103) 2017-03-25 16:18:33 -04:00
7dd45490f8 don't use inplace backward, remove unnecessary zero for grad_input (#1079) 2017-03-25 20:04:48 +01:00
bf632544e6 Pass NULL rinfo_ to btrifact by default (#1089) 2017-03-24 19:49:40 -04:00
282402d4f3 Revert "Add back zero fill for ger" (#1093)
This reverts commit 5a761dbe65d2221e9c200b3f8ea0590b5d9b923f.
2017-03-24 19:49:31 -04:00
1461709ea0 Improving the performance of IndexLinear:updateOutput
- Removes separate kernel for updateOutputTrain
2017-03-24 16:34:31 -07:00
cce03074f5 Merge commit '3acbbb30f2bdc6ccf4ffb6f7d568e7916d4e384d' 2017-03-24 16:19:44 -07:00
f2f63773d8 Merge commit '52911f9e47f679045a238eb9dfdc5db55bf98cc9' 2017-03-24 16:19:19 -07:00
84aa41824c Merge commit 'b4fe5ad641181f30bdcc4749c949206a3ebb04b4' 2017-03-24 16:19:05 -07:00
25c8a117af Merge commit 'e8196f990db4ba368010f0d950bebf1fb13c2888' 2017-03-24 16:18:52 -07:00
ae122707b5 Don't do extra resize in linear bias 2017-03-24 23:41:15 +01:00
b4fe5ad641 Use zero instead of mul when beta == 0 in addr 2017-03-24 13:09:00 -07:00
5a761dbe65 Add back zero fill for ger
Ger does not have beta argument, so has to be zero-filled.
2017-03-24 21:03:02 +01:00
dd893391d5 Add argument to children to yield the name of the modules (#941) 2017-03-24 20:02:05 +01:00
649f04d077 Added Pascal nvcc flags, bumped version 2017-03-24 11:58:14 -07:00
f45ef5fdb8 AllGather algorithm [CPU]
Summary: Allgather ring CPU implementation. It does |buffers| x |contextSize| passes.

Reviewed By: pietern

Differential Revision: D4723809

fbshipit-source-id: ffd8366ac7e1746555474e173143d33cee497822
2017-03-24 11:06:57 -07:00
e8196f990d Make rinfo_ argument optional in btrifact 2017-03-24 09:01:36 -07:00
269b77a1b2 Make rinfo_ optional in btrifact 2017-03-24 09:00:39 -07:00
476d85dd3f DataLoader: Fix batch data type for numpy array (#1074) 2017-03-24 11:34:24 -04:00
63f6c0d692 add Pairwise distance (#835) 2017-03-24 11:29:40 -04:00
b546fa3fcd add assertTrue to padding tests 2017-03-24 15:27:51 +01:00
1d656b6769 Ensure displayed progress in ProgressMonitor is between 0 and 100%.
Fixes #1086
2017-03-24 15:21:52 +01:00
3acbbb30f2 Fix inconsistent in-place and out-of-place for HardTanh
in-place and out-of-place updateGradOutput results are different where input=min_val or input=max_val
2017-03-23 17:27:29 -07:00
52911f9e47 Fix inconsistent in-place and out-of-place implementations
Currently in-place and out-of-place updateGradOutput will produce different results for input=max_val or input=min_val - in-place won't backprop gradient where input=max_val or input=min_val, out-of-place will backprop gradient in this case.
2017-03-23 17:22:55 -07:00
a65e0f488c Remove zero fill where not needed (#1077) 2017-03-23 19:44:00 -04:00
8dc5d2a22e export current_blas_handle 2017-03-23 23:32:45 +01:00
ed97f3f854 Adding support for flattened inputs for IndexLinear
- Adding relevant tests
2017-03-23 14:18:41 -07:00
a231fe8fc5 IndexLinear support for cunn 2017-03-23 14:18:01 -07:00
bb353ccc17 Add batch triangular factorization and solves, add IntegerTensor to cwrap (#903) 2017-03-23 15:06:00 -04:00
ced0054a9e Fix formula for stddevs grad in Normal function (#1076) 2017-03-23 14:32:34 -04:00
68ee5ede29 make inplace tests compare input grads 2017-03-23 18:54:00 +01:00
2966e3295d Make static/shared configurable and install optional
Summary:
This makes it possible to embed Gloo in a project without CMake
installing Gloo headers and/or libraries, or having a runtime
dependency (and statically link to it).

Also:
* Install benchmark tools
* Statically link to NCCL if the bundled version is used
Closes https://github.com/facebookincubator/gloo/pull/19

Differential Revision: D4762432

Pulled By: pietern

fbshipit-source-id: cf38903e6c51f2480fba4ff18cbdc0c9080df0c4
2017-03-23 09:06:37 -07:00
4df98e2927 Merge commit '3865606299b1fbcd0a94cef4a66c1bc007246da8' 2017-03-23 08:39:43 -07:00
6ccac5ce28 Merge commit 'd3334db6274d7a3cd07f20d583056e453dc8134d' 2017-03-23 08:39:30 -07:00
3865606299 adding batch triangular factorization and solves, add IntegerTensor to cwrap 2017-03-23 11:37:00 -04:00
d3334db627 adding batch triangular factorization and solves, add IntegerTensor to cwrap 2017-03-23 11:35:35 -04:00
50f5a4dd18 fix BCE loss formula visualization (#1072) 2017-03-23 11:27:21 -04:00
b60936b9ae fix NLLLoss2d documentation 2017-03-23 10:06:40 -04:00
2d750b9da5 fix typo 2017-03-23 09:40:06 -04:00
ca376d4584 implement autograd function trace 2017-03-23 10:37:52 +01:00
ef183a1d23 Merge commit '5cd313ed23a3b11ddd739bcfedaee6e310e4e438' 2017-03-22 19:25:46 -07:00
f4d8944973 fix OSX fread bug (#1068) 2017-03-22 22:06:14 -04:00
6b7aef63ac Added support for multidimensional tensors in PReLU; Channel number now in second dimension 2017-03-22 20:36:52 -04:00
b3ab4b1094 Check torch.backends.cudnn.enabled, padding, and output_padding (#996)
* Check torch.backends.cudnn.enabled
* Don't allow negative padding and output_padding values
2017-03-22 19:42:11 -04:00
1e8cb82a2d Break only after the update in L-BFGS 2017-03-22 18:58:42 -04:00
dd399a8d68 Return total param norm from clip_grad_norm 2017-03-22 18:58:42 -04:00
faac0f5c25 Fix torch.cat bugs
Always use PySequence API and disallow catting along inexistent
dimensions.
2017-03-22 18:58:42 -04:00
c36f47bd1e Make random_ exclusive and make generator kwarg only in all random
functions
2017-03-22 18:58:42 -04:00
3d1888cd95 Fix size mismatch in CosineEmbeddingLoss backward 2017-03-22 18:58:42 -04:00
97a82a3018 fix formatting in upsampling docs (#1067) 2017-03-22 18:06:31 -04:00
5cd313ed23 Fix TH_TENSOR_APPLYX_D in the case where the dimension of interest is the inner dimension 2017-03-22 13:15:01 -07:00
b414494035 Merge commit '714b2b8bf657afe41cc8503998b6d919339b8075' 2017-03-22 12:49:29 -07:00
c10efc646e Merge commit 'e17d84d38edf6094175deead555abbc96321b69f' 2017-03-22 12:49:11 -07:00
348531ad8d Merge commit '0056b0883426e38ffbd646c040b6c281d12673f2' 2017-03-22 12:48:57 -07:00
9d83121ef5 Don't add options to CUDA_NVCC_FLAGS if already set
Summary:
This may be the case when the Gloo CMake files are sourced from a
parent project that has already imported CMake CUDA support. If these
checks are not performed then CUDA_NVCC_FLAGS might contain
conflicting options.

Verified this works while working on Gloo for Caffe2.
Closes https://github.com/facebookincubator/gloo/pull/18

Differential Revision: D4756179

Pulled By: pietern

fbshipit-source-id: 32fc39ec2322cce5899a2398ebbf8395d3917502
2017-03-22 12:35:04 -07:00
6d7cb31e53 MPI: Duplicate MPI_Comm and allreduce maxLength as MPI_UNSIGNED_LONG.
Summary:
Some small MPI-related changes:
1) Instead of making an object copy of the MPI_Comm, call MPI_Comm_dup;
because the (passed-in) communicator is used later via the call to
connectFullMesh this guarantees that the communicator will not have been
freed by user before connectFullMesh is called.

2) Allreduce for maxLength is done on an unsigned long type; use the
corresponding MPI type.
Closes https://github.com/facebookincubator/gloo/pull/17

Differential Revision: D4754195

Pulled By: pietern

fbshipit-source-id: 863fd33c726f88120f8f5ee61964c3525babbf97
2017-03-22 09:26:00 -07:00
30a9cf7a46 Mark transport pair after IO error and propagate to calling threads
Summary:
This change solidifies IO error handling between threads and successive transport API calls. When an IO exception occurs, signal all buffers of the error, propagating the exception from the device thread or single user thread onto all user threads. Store the exception in the pair and check on future API calls or device events. Swallow all IO exceptions in the device loop.

Right now IO exceptions during portions of the listen/connect phase will result in an indefinite wait in the peer. I will address this with a configurable timeout (t16205269).

Reviewed By: pietern

Differential Revision: D4749248

fbshipit-source-id: c75ee3b20875d561bf84631e5384e28015dabad3
2017-03-22 09:06:24 -07:00
714b2b8bf6 Merge pull request #453 from apaszke/lookup_renorm
Cast accumulator in LookupTable renorm to accreal
2017-03-22 11:53:41 -04:00
fe4bd5066b Added support for multidimensional tensors in PReLU; Channel number now in second dimension 2017-03-22 11:45:02 -04:00
e17d84d38e Added support for multidimensional tensors in PReLU; Channel number now in second dimension 2017-03-22 11:44:28 -04:00
b9aef6bc03 Fixing default values for LR and Epsilon (#895)
It seems that the default values for LR and Epsilon (previously, 1E-2 and 1E-38 respectively) were different from the ones recommended by the authors (2E-3 and 1E-8, respectively). Other packages such as Keras (https://github.com/fchollet/keras/blob/master/keras/optimizers.py#L474) and Lasagne (https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py#L612) use the suggested values as well.
2017-03-22 11:34:39 -04:00
0056b08834 Narrow V when returning only some right singular vectors 2017-03-22 08:33:03 -07:00
bd0df61bb5 Cast accumulator in LookupTable renorm to accreal 2017-03-22 08:29:39 -07:00
d9678c2e34 Correct typo in batchnorm documentation 2017-03-22 13:55:45 +01:00
b3c0aa3b7d fix a typo in ffi doc (#1055) 2017-03-21 15:37:48 -05:00
8fc9c79287 Add nccl submodule 2017-03-21 17:53:58 +00:00
4fce1a389f Include CUDA support in CMake build
Summary:
* Pull in NCCL submodule
* Include (heavily modified) CUDA/NCCL build files from [Caffe2](https://github.com/caffe2/caffe2)
* Build CUDA enabled benchmark/test
* Enable CUDA build in Travis configuration
Closes https://github.com/facebookincubator/gloo/pull/16

Differential Revision: D4746784

Pulled By: pietern

fbshipit-source-id: b5c6cbcd8ac8b30c071851cdc7ae88c69c0ab4d6
2017-03-21 10:51:57 -07:00
8ce56c30d4 Convert runtime errors to gloo exceptions
Summary:
Bubble up gloo configuration and network errors as exceptions. The caller may be able to recover. Other unexpected failures continue to be handled as fatal with GLOO_ENFORCE

Modify ibverb API validation to check for != 0 instead of -1 to conform with API definition.

Still need to convert some errors in the rendezvous code and add documentation.

Will pass device loop errors onto the calling thread in a future diff

Reviewed By: pietern

Differential Revision: D4730362

fbshipit-source-id: c801adb353013e7f541ab01ac16a0cc71c1c36b2
2017-03-20 13:50:29 -07:00
4667f936e3 Add explicit dependency on pthreads
Summary:
Got linker errors on Ubuntu 16.04 (not on 14.04).
Adding the pthreads dependency explicitly fixes it.
Closes https://github.com/facebookincubator/gloo/pull/15

Differential Revision: D4739081

Pulled By: pietern

fbshipit-source-id: 6bae7d361d934e93560d28a76c3dca4a4236f113
2017-03-20 11:52:41 -07:00
4eaa30b634 Build tweaks
Summary:
* Mention submodules in README
* Remove fetch.sh from third-party directory
* Rename benchmark/test build targets
Closes https://github.com/facebookincubator/gloo/pull/14

Differential Revision: D4739077

Pulled By: pietern

fbshipit-source-id: 859c1cac0c0163870eae8f18e4e2f177a6bc8890
2017-03-20 11:35:19 -07:00
77fbc12f23 Fix some deadlocks when torch_shm_manager is not found (#1030)
- Add additional timeouts to test_multiprocessing to reduce chances of
   hanging indefinitely on failure
 - Add missing header guards
 - Fix typo
 - Check that torch_shm_manager exists in torch/__init__.py
2017-03-17 18:28:39 -04:00
7e46eb1613 Fixes for Prod and Expand functions (#1026)
Thanks to @ChangYong-Oh for the original implementation.
2017-03-17 18:24:44 -04:00
821656d2d8 add CONTRIBUTING document 2017-03-17 07:59:37 -04:00
86e40ed875 Fix a typo in docs about pinned memory buffers (#1023)
* remove misleading guide for BCELoss

* fix docs about pinned memory buffers
2017-03-17 05:08:03 -04:00
1d0699e147 Define exception hierarchy
Summary: Define an exception hierarchy for gloo runtime errors. Keep GLOO_ENFORCE macros for assertions.

Reviewed By: pietern

Differential Revision: D4724124

fbshipit-source-id: 22f0581b06524579e86fe335770bdb620d20e258
2017-03-16 15:08:01 -07:00
b9379cfab7 Use cuDNN and NCCL symbols from _C library (#1017)
This ensures that we use the same library at the C++ level and with
Python ctypes. It moves the searching for the correct library from
run-time to compile-time.
2017-03-16 16:10:17 -04:00
f0b75c4aa4 Merge pull request #729 from shenxiul/cuda_linspace
linspace and logspace for CUDA Tensors
2017-03-16 14:03:00 -04:00
7654b3f49e Add function to compute cross_entropy for 2D image (#802) 2017-03-16 17:34:04 +01:00
37ebbc2809 the length of any item in padded_sequence should be greater than 0 (#1013) 2017-03-16 17:32:43 +01:00
8241cd7b6e Fix compilation error when compiling with 'clang -x cuda'.
Functions vFetch and vStore are not found by ADL with clang,
so they need to be declared before usage in ReduceCopy.
2017-03-16 12:01:11 +01:00
a7781fdebc Use default Redis port in RedisStore constructor
Summary: TSIA

Reviewed By: andrewwdye

Differential Revision: D4718573

fbshipit-source-id: c0b9aa78cf1f4db910526841c0172537b9243f7e
2017-03-15 22:19:51 -07:00
29ddbc3e37 implement linspace, logspace and range in CUDA 2017-03-15 20:50:30 -07:00
16a133ed9a Fixes for testing on FB infra (#1009)
- make each test in test_autograd have a unique name ignoring case
 - assemble all tests when test_legacy_nn is imported
 - import Python.h in PtrWrapper.h
2017-03-15 18:37:11 -04:00
1aa665f6a8 Documentation
Summary:
* Add separate file for rendezvous docs
* Mention using MPI for rendezvous
* Fix algorithm docs formatting
Closes https://github.com/facebookincubator/gloo/pull/13

Differential Revision: D4715442

Pulled By: pietern

fbshipit-source-id: 0469ab8d16fd489a38c399ec2b25860d1225ce72
2017-03-15 14:58:51 -07:00
c4d1318662 Fix map_location in torch.load (#1006) 2017-03-15 16:54:19 -04:00
379ae6d865 Refactor out dispatchStateless (#1007)
Some of the error messages were incorrect due to erroneous
'tensor == THPDefaultTensorClass' checks
2017-03-15 16:24:55 -04:00
24376ff9d3 Merge pull request #723 from killeent/scan-primitive
add implementation of inclusive scan via upsweep-downsweep
2017-03-15 14:37:21 -04:00
6ac793dcbe Reuse ncclComm_t across algorithm instances
Summary: Initializing ncclComm_t is expensive. Allocate a set of ncclComm_t for each unique device set and cache for reuse. With this change the CudaAllreduceChunked tests runtime improved from ~170 sec -> ~10 sec on my machine. There is no improvement in the benchmark numbers because the algorithm instance is only allocated once.

Reviewed By: pietern

Differential Revision: D4708943

fbshipit-source-id: 85b85070586d6683a762b8282df593ca831e7bc7
2017-03-15 09:51:43 -07:00
e00d9c1fd8 Execute benchmark through mpirun
Summary:
This change includes CMake changes to compile the MPI assets when the USE_MPI flag is enabled. If so, the benchmark tool can now be launched through mpirun.

Includes the changes done in #11.
Closes https://github.com/facebookincubator/gloo/pull/12

Reviewed By: Yangqing

Differential Revision: D4712060

Pulled By: pietern

fbshipit-source-id: 0d0e93882f5822583f59304d4256dbdf5dea7483
2017-03-15 08:21:12 -07:00
be6322e4b5 Update nn.init docstrings to correctly reference the module (#1001) 2017-03-15 11:17:59 -04:00
62063b2f62 Fix docs for pointwise ops (#845) (#985)
* add torch.nn.init docs to the source folder
2017-03-15 11:08:05 -04:00
13b1580613 add F.pad to docs 2017-03-15 00:09:14 -04:00
fe788f5003 Use correct event to synchronize destination buffer in NCCLElement
Summary: NCCLOp::runNCCL is mistakenly recording an event in the source pointer after the NCCL op. This results in NCCLOp::wait() returning without synchronizing with the output buffer. The synchronous tests using NCCL fail.

Reviewed By: pietern

Differential Revision: D4708860

fbshipit-source-id: 0c36511e260b587d410e5c9604552ceedd06d988
2017-03-14 19:20:59 -07:00
e50a1f19b3 Use streams in scatter to overlap copy with compute 2017-03-14 22:46:07 +01:00
e86db387ba Fix conv1d backward segfault (#999) 2017-03-14 16:15:53 -04:00
1bf61b8adc Add googletest submodule 2017-03-14 03:39:54 +00:00
704ee3ca68 Use cudart symbols from the main program.
Our extension library links against cudart and pulls in the symbols. Use
LoadLibrary(None) to use the same symbols as the _C extension.

This fixes the PyTorch wheel when you don't have system CUDA installed.
2017-03-13 19:45:34 -04:00
9004652c7b updated the documentation to remove the unnecessary copy grads when using multiprocessing 2017-03-13 19:04:17 -04:00
aca6ce984c change lookup table sort 2017-03-13 13:55:16 -07:00
ed8773f7bd add legacy_serialized.pt to gitignore 2017-03-13 16:37:35 -04:00
0f7b7b27b1 Fix build for CMake 2.8.12
Summary:
This is the minimum required CMake version (also the version that is available on Ubuntu Trusty (14.04)).
Closes https://github.com/facebookincubator/gloo/pull/9

Reviewed By: Yangqing

Differential Revision: D4698659

Pulled By: pietern

fbshipit-source-id: bf01541fe485c03e7c665f175c2887feaf9516a3
2017-03-13 13:06:15 -07:00
48f48b6ff2 fix more flaky VolumetricMaxPooling tests 2017-03-13 14:38:27 -04:00
615b27eadf fix corner case in SetItem of Variable 2017-03-13 14:38:27 -04:00
86ede33035 CMake improvements for Gloo
Summary: Install headers and add .. to include directories

Reviewed By: pietern

Differential Revision: D4695500

fbshipit-source-id: f48a49f03e575408829793cb63bfdb16d8e3a309
2017-03-13 11:06:05 -07:00
bd09055207 Synchronize all NCCL ops with shared per-device streams
Summary:
Allocate a set of per-device streams used to serialize NCCL op scheduling. These ensure concurrent NCCL ops are not interleaved across devices (i.e., through priority scheduling), resulting in deadlock.

Synchronize source and destination streams with NCCL streams.

Reviewed By: pietern

Differential Revision: D4685360

fbshipit-source-id: 3c228b195b0a0d9d7cccc720163898d344a5ed4c
2017-03-13 09:20:05 -07:00
4bd220d91a Travis contbuild scripts and cmake fix.
Summary:
TSIA. Redoing #7 to kick travis.
Closes https://github.com/facebookincubator/gloo/pull/8

Reviewed By: Yangqing

Differential Revision: D4697132

Pulled By: pietern

fbshipit-source-id: d03148aeddb2cf927b4ef3689c97d9ba4f4cdc9d
2017-03-13 08:36:10 -07:00
170d790b66 fix doc of conv3d in conv.py (#989)
the second dimension should be height.
2017-03-13 11:30:13 -04:00
e216f557fd Fixes issue returning strings from a Dataloader with pin_memory=True (#908) 2017-03-13 10:11:07 +01:00
997312c233 Add WeightedRandomSampler (#980)
Samples elements from `[0,..,len(weights)-1]` with given probabilities (weights). So far there is no means to either introduce sample weights in loss functions or while sampling from a dataset. This is an attempt to add the functionality for the latter issue.
2017-03-13 00:27:05 -04:00
d602b3a834 Allow submodules and parameters to shadow attrs on assignment 2017-03-12 13:31:32 -04:00
f531d98341 Fix memory leak in torch.from_numpy 2017-03-12 13:31:32 -04:00
6bdd5ecaf5 Remove some unnecessary AutoGPU calls 2017-03-12 13:31:32 -04:00
bfbde9d6eb Fix Embedding bug when max_norm was used 2017-03-12 13:31:32 -04:00
b9c816a796 Fix run_test.sh --coverage option. (#983) 2017-03-11 19:26:02 -05:00
2f5c215d34 Update setup.py (#981)
Adding `description` to `setup.py`
2017-03-11 12:14:07 -05:00
01650ac9de add torch.nn.init docs to the source folder (#979) 2017-03-11 10:11:30 -05:00
ce536aa355 fix example in docs for NLLLoss 2017-03-10 16:48:08 -05:00
fc0af33a18 key only block-wide bitonic sort 2017-03-10 11:50:43 -08:00
c7c4778af6 modify docs of broadcast to fix issue #940 (#970) 2017-03-10 09:54:43 -05:00
d873077349 Create context from existing MPI communicator
Summary:
This makes it easy to use Gloo transports and algorithms in existing
MPI environments.

Reviewed By: andrewwdye

Differential Revision: D4685999

fbshipit-source-id: cfc7d0e445893512b4e4ed2abe1bb280d83b9c70
2017-03-09 23:06:18 -08:00
0c38827318 Split out rendezvous specifics from context
Summary:
How pairs are setup and connected to one another is specific to
whatever underlying rendezvous mechanism is used. This change moves
the `connectFullMesh` function into a subclass in the `rendezvous`
directory. This prepares for a separate MPI context that can setup
pairs between processes using an existing MPI communicator.

Reviewed By: andrewwdye

Differential Revision: D4684755

fbshipit-source-id: 9eb643b8ba545b3e6f9a36b65642b3b04a5f0077
2017-03-09 23:06:18 -08:00
fb766c00b3 Align async\wait pattern to use wait() naming
Summary: TSIA

Reviewed By: pietern

Differential Revision: D4686783

fbshipit-source-id: ccbdace0d53219bd4b881ea27f7f972b206215b6
2017-03-09 21:20:45 -08:00
e600c9830a Fix up NCCLElement construction in CudaBroadcastOneToAll
Summary: TSIA

Reviewed By: pietern

Differential Revision: D4686520

fbshipit-source-id: 657ca90aa1971be152b037563105a9f490137a69
2017-03-09 20:37:03 -08:00
73a65cd29f simple ordering fix to avoid gcc warning 2017-03-09 17:10:59 -08:00
b785ed0ac0 Fix Embedding and CosineEmbeddingLoss on non-float CUDA (#965) 2017-03-09 18:04:40 -05:00
b2d077d81d Update _tensor_docs.py (#966) 2017-03-09 18:04:19 -05:00
4814b0bc09 Recompose NCCLElement of src/dst CudaDevicePointers
Summary: CudaDevicePointer has the information we need for a NCCL op. Refactor NCCLElement as a composition of src and dst CudaDevicePointers. This allows for separate streams for src and dst, and will simplify a future change to use a static set of streams for all NCCL ops.

Reviewed By: pietern

Differential Revision: D4679483

fbshipit-source-id: 75656cc2fa5b5e2a6c096d914d2111769a47291b
2017-03-09 12:26:55 -08:00
b1c2714ad5 Add momentum and centered options to RMSProp (#810)
* add momentum and centered options

Add two options :
 - Momentum (like SGD's momentum)
- Centered RMSprop, as in Graves 2013 ( https://arxiv.org/abs/1308.0850 ) : grad is normalized by running estimation of its variance

* some PEP8

* bug in default

* bug2

* sign mistake

* alloc of momentum & centered only if needed

* add link to docstring

* some pep8 on docstring

* implement __setstate__() for backward compatibility

* correct grammar mistake

* multiply by lr when adding delta to params

* rename momentum variables

* change __init__ params order
2017-03-09 10:04:32 +01:00
a462edd0f6 Docs(RNN|GRU|LSTM): Note dropout applies to all layers *except* the last layer (#961)
This is an important clarification to make as otherwise users are misled as to where they may need to add dropout and to clarify the situation would need to delve into the backend implementation. 
4647f753bc/torch/nn/_functions/rnn.py (L73)
2017-03-08 18:09:11 -05:00
c2425fc9a1 Fix build warning for C file 2017-03-08 21:28:57 +01:00
fbcedf2da2 Merge commit '3d95e13b332e1b31d706b59c3b67f886958ece79' 2017-03-08 09:09:46 -08:00
3d95e13b33 Check event_count before merging blocks 2017-03-08 08:49:04 -08:00
228e1a8696 Add CUDA caching allocator accessor 2017-03-08 08:29:50 -08:00
be0e8c0009 Use sequential slot numbers from context
Summary:
Add a nextSlot() function to the context that increments and
returns a slot number. This enables multiple algorithms sharing the
pairs part of a context. The slot numbers were hardcoded before this
change, which prevented reuse.

After this change, some of the tests can be changed to run multiple
times (or do a parameter sweep) without respawning a new threadpool or
allocating new fixtures.

Also change some internally used variable names for more consistency.

Reviewed By: andrewwdye

Differential Revision: D4668268

fbshipit-source-id: 65cbc8f2666f0b7d2f1c72574b86d913f5855d62
2017-03-08 08:23:03 -08:00
3fa8a3ff46 add implementation of inclusive scan via upsweep-downsweep 2017-03-08 07:34:14 -08:00
4647f753bc Merge commit '0f872ed02fbaf5b326f235b3f18724171b061416' 2017-03-07 14:45:01 -08:00
7ba5e7cea1 fix VolumetricMaxPooling test instability (#952) 2017-03-07 10:55:46 -05:00
9b626a8047 Fix documentation - replace 'matrix' with 'vector' (#951) 2017-03-07 10:40:18 -05:00
bd0e9a73c7 Fix some simple build error on MacOS (#949)
Issue #948

Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-03-07 09:47:49 -05:00
7bddd586f7 Change PrefixStore to take a Store reference
Summary:
Taking ownership of a std::unique_ptr is a bit awkward. It's actually
useful to reuse the underlying store and create multiple prefix stores
against it.

Reviewed By: andrewwdye

Differential Revision: D4662354

fbshipit-source-id: eaf62f7d5a97d6ee848252ff3124c28da349f6f2
2017-03-06 22:19:49 -08:00
da10450535 Allow multiple input pointers to broadcast algorithms
Summary:
This changes the constructor prototype of the broadcast algorithms.
They now take the rank of the root process and the rank of the root
pointer. The root process now also broadcasts locally, among the
specified pointers, in addition to broadcasting to its peer processes.

The broadcast tests are made more robust to use a different value at
every index for every buffer, like the allreduce tests. To accommodate
multiple input buffers for CPU side algorithms, I added a Fixture
helper, and renamed the existing Fixture class to CudaFixture.

The broadcast tests contain a few TODOs since they don't vary the root
process or root pointer yet. I anecdotally verified this does work,
but didn't want to include the necessary changes to do so in this
commit (it requires some changes in rendezvous and NCCL code). A fix
for this is forthcoming.

Reviewed By: andrewwdye

Differential Revision: D4661635

fbshipit-source-id: c069e0d4e8f676a63efd74b15ea1156adcc09477
2017-03-06 22:19:49 -08:00
2b1cd919ce Update extending.rst (#933) 2017-03-06 23:23:14 -05:00
8e46a15605 add docs for set_printoptions to sphinx (#945) 2017-03-06 21:52:37 -05:00
15a9fbdedb Merge pull request #881 from colesbury/parallelize_backwards
Parallelize autograd backwards
2017-03-06 16:57:19 -05:00
6336300880 Fix bug where adding a hook could replace an existing hook.
We were keying hooks by RemovableHandle id. However, we don't hold onto
handles and ids of dead objects can be reused. This replaces id(handle)
with a global counter.
2017-03-06 12:47:53 -08:00
5073132837 Implement 'pre' and 'post' hooks at the C++ autograd level 2017-03-06 12:47:53 -08:00
65b66264d4 Improve broadcast/reduce performance by coalescing tensors 2017-03-06 12:47:53 -08:00
0f872ed02f Add THCCachingAllocator_recordStream()
This is similar to THCCachingHostAllocator_recordEvent() but on CUDA
allocations. It's useful for overlapping copies with computation. The
workflow is approximately:

  0. allocate dst tensor on copy stream
  1. copy from CPU to GPU on copy stream
  2. synchronize the main stream with the copy stream via
     cudaStreamWaitEvent
  3. THCCachingAllocator_recordStream(dst, main_stream)

The recordStream() call is necessary to prevent the dst tensor from
being reused on the copy stream before the main stream finishes work.

Previously, you would need to insert a second cudaStreamWaitEvent before
dst is freed to force the copy stream to wait on the main stream.
2017-03-06 10:50:19 -08:00
761d6799be code syntax error in document (serialization.rst) (#937) 2017-03-06 10:06:04 -05:00
0d179aa8db Updated datasets.rst, combined all commits (#931)
Added MNIST in the docs

Updated incomplete cifar doc

Updated the datasets.rst to include all datasets
2017-03-05 17:38:28 -05:00
5b171ad7c2 remove misleading guide for BCELoss (#924) 2017-03-05 14:31:01 -05:00
ac9245aeb3 import numpy before setting dlopen flags (#928) 2017-03-05 14:30:13 -05:00
60736bdf99 fix corner case in kwargs for DataParallel (#930) 2017-03-05 14:27:52 -05:00
7d58765cee docs: Fixed example code bug in extending module doc. 2017-03-05 12:09:08 -05:00
76f7d749e4 bump version 2017-03-05 08:49:52 -08:00
0b7374eb44 add THCS to build_all flags 2017-03-05 11:32:43 -05:00
6fff764155 replace old select_compute_arch.cmake with new 2017-03-05 11:32:43 -05:00
8ced72ccb8 link THPP to THCS when CUDA available 2017-03-05 11:32:43 -05:00
b1ae7f90d5 Added functionality for data parallel table (#843) 2017-03-05 02:35:46 +01:00
8b61ee522e Merge commit 'aec182ae72d51dad0f46cdfe7ff9a41380d7da35' 2017-03-04 08:58:21 -08:00
76ca3eb191 Merge commit 'fea50a51ee2d9af15c42f785ab2232469357b557' 2017-03-04 08:58:02 -08:00
fea50a51ee reintroduce USE_AVX* for files which dont have -mavx* set 2017-03-04 08:55:43 -08:00
51e589ed73 fix critical bug in adds SSE implementation 2017-03-04 08:39:19 -08:00
2e87643761 remove fastmath for everything except simd/convolve 2017-03-04 08:16:47 -08:00
ba9a85f271 fix bug introduced in #952 2017-03-03 21:00:05 -08:00
a22fd7194e More assertions for state change in TCP transport
Summary:
I have seen a stress run crash with unexpected state. Adding these
assertions will give more information when it happens again.

```
terminate called after throwing an instance of 'gloo::EnforceNotMet'
  what():  [enforce fail at gloo/transport/tcp/pair.cc:407] false. Unexpected state: 5
```

Reviewed By: andrewwdye

Differential Revision: D4652216

fbshipit-source-id: e787f4097f5ab32367dd9fa5a336d0389b97e955
2017-03-03 14:20:07 -08:00
0714d7a3ca set AVX/AVX2 flags only for specific files 2017-03-03 12:17:14 -08:00
fb7bafdd0f Update README.md
Summary:
Fix styling in README
Closes https://github.com/facebookincubator/gloo/pull/4

Differential Revision: D4651501

Pulled By: pietern

fbshipit-source-id: e2d4384ac94972f6c4fc03467564460ea4ce5c85
2017-03-03 11:40:02 -08:00
34ce58c909 Parallelize backwards 2017-03-03 11:26:00 -08:00
c238ee3681 Fix issues with lazy grad initialization (#912) 2017-03-03 14:23:51 -05:00
e1d7eaf7d8 Latency optimization tips
Summary: Closes https://github.com/facebookincubator/gloo/pull/3

Differential Revision: D4651203

Pulled By: pietern

fbshipit-source-id: 202afcbe26ec77ea93e48e72fea0d36f18b1b026
2017-03-03 11:05:17 -08:00
f5338a1fb8 compile AVX and AVX2 intrinsic code in separate files. Cleanup use of USE_AVX and USE_AVX2 macros in favor of __AVX__ and __AVX2__ 2017-03-03 10:30:18 -08:00
d96ad41191 cleanup TH CMakeLists and THGeneral.h of unused flags 2017-03-03 09:48:26 -08:00
f17cfe4293 sparse tensor operations (#735) 2017-03-03 18:37:03 +01:00
aec182ae72 Support half precision in baddbmm 2017-03-03 16:15:39 +01:00
c93c884ee2 Add negative dimension to transpose and tests (#792) 2017-03-03 09:31:22 -05:00
c42a2d4d24 Fix dimension check for cat (#959)
* Use TH_INDEX_BASE when verifying dimension for cat

* Adding tests for cat when no dimension is specified.

- Also renamed ldimension to cat_dimension to be more specific.
2017-03-03 09:05:06 -05:00
f89252c336 Merge pull request #719 from twitter-forks/cat-fix
Fixes to cat
2017-03-03 09:04:06 -05:00
490c15fae9 Fix slicing with step (#905) 2017-03-03 09:00:14 -05:00
7e3b572ca7 Document algorithm semantics
Summary: TSIA

Reviewed By: andrewwdye

Differential Revision: D4647587

fbshipit-source-id: a804e7479e6e2f511bfa59712b4b4a88bdf657e3
2017-03-02 21:35:28 -08:00
5fbcd88102 Rename public member fields on gloo::Context
Summary:
The fields are public so their names should not end with an
underscore.

Reviewed By: andrewwdye

Differential Revision: D4645038

fbshipit-source-id: c12b47affbe511383a4722717a06abb61918473b
2017-03-02 19:49:45 -08:00
f2d72ba10f Revert "make handles to be thread-local"
This reverts commit 0720ba53b344809ce3d0bdfb1ea561afa5fe0646.
2017-03-02 17:48:24 -08:00
2108b42b92 Fix bug in cat when dimension is not specified.
- Code was using dimension specified which was negative
- Changed the cat_dimension variable to be more explicit
- Fixed code to use the cat_dimension variable
2017-03-02 16:14:09 -08:00
bae8df62d3 Add missing THCudaCheck around cudaMemcpy 2017-03-02 16:13:39 -08:00
a2b2880cc2 Remove underscores from public fields in NCCLContext
Summary: Remove underscores from public fields in NCCLContext

Reviewed By: pietern

Differential Revision: D4645857

fbshipit-source-id: 2c28a1c23d31097d685c0768dad9b99bbef7b171
2017-03-02 16:05:15 -08:00
70fc15c05c More documentation
Summary: TSIA

Reviewed By: andrewwdye

Differential Revision: D4644734

fbshipit-source-id: 50f5fadd2c5cd04e06a025f5538187ed852e669a
2017-03-02 15:50:37 -08:00
98775b6bb4 Merge pull request #718 from killeent/templatize-scan
genericize PrefixSum --> PrefixScan via binary operator template parameter
2017-03-02 17:50:56 -05:00
b7cc2a501f genericize PrefixSum --> prefixScan 2017-03-02 14:31:27 -08:00
0720ba53b3 make handles to be thread-local 2017-03-02 11:10:49 -08:00
ff5fa11129 make mkl link to threaded version with GCC (#958) 2017-03-02 13:37:25 -05:00
837023bb4f Change benchmarks to support multiple input buffers
Summary:
The NCCL code used in CUDA-aware allreduce does local reduction of N
buffers prior to putting anything on the wire. Supporting this in the
benchmark tool to measure the impact under various configurations.

Other minor tweaks in this change:
* Specify sub-second iteration time
* Templatize allreduce benchmarks (the algorithms share a constructor
  prototype)

Reviewed By: andrewwdye

Differential Revision: D4639517

fbshipit-source-id: f7417d3e9f79278a3b1eca48d779f48b77e5260c
2017-03-02 10:16:39 -08:00
e88d241757 Cuda algorithms should return asynchronously if device streams are passed in
Summary: Cuda algorithms take an optional set of device streams to sequence operations. If streams are provided, the algorithms should enqueue final output buffer operations on the associated stream and return asynchronously. Destructors that allocate streams/events should synchronize before tearing down.

Reviewed By: pietern

Differential Revision: D4636447

fbshipit-source-id: 32ec2adc214c83b0b4bc0fff8993ab196459117b
2017-03-02 10:16:38 -08:00
ecb37e4439 Update tests to cover potential reordering problems
Summary:
With this change, every buffer gets assigned a different
value at every index. This means reordering of segments (e.g. in the
chunked algorithm) would surface as test errors.

Reviewed By: andrewwdye

Differential Revision: D4636368

fbshipit-source-id: 464eb1515d1590e12481961d427a92e2ebb3be82
2017-03-02 10:16:38 -08:00
0c88194807 CUDA documentation
Summary: CUDA documentation detailing high-level support for CUDA in gloo algorithms, usage of streams, and synchronizing memory management.

Reviewed By: pietern

Differential Revision: D4633120

fbshipit-source-id: d88e230c8dc82fe48cda0f401b61758fa4f07f2e
2017-03-02 10:16:38 -08:00
50e73a8313 Support synchronous mode in ibverbs transport
Summary:
Synchronous mode means using the calling thread instead of the device
thread for completion handling. Since this saves a context switch in
the critical path, this is very beneficial for low latency algorithms.

For example: the p99 of a 4-way barrier drops from 17us to 4us.

Reviewed By: andrewwdye

Differential Revision: D4626948

fbshipit-source-id: 013b1680497589fe5ad0bca38600bce6a410200b
2017-03-02 10:16:38 -08:00
fc7f026980 Refactor ibverbs transport to prepare for sync mode
Summary:
All pairs created by a device would use the same completion queue.
Supporting sync mode that way is difficult, as there is no way to
filter completions for a particular pair. This change refactors this
to use a single completion queue per pair so that this is no longer an
issue. This change is a preparation for supporting synchronous mode
(where the calling thread itself will poll the ibv library for
completions instead of the device thread).

This change also includes a refactoring of the way transient memory
regions are handled so that they are properly deregistered and
deallocated when no longer needed.

Reviewed By: andrewwdye

Differential Revision: D4625146

fbshipit-source-id: 21bf5ab321534fbd5c03f12049c10fc67da68944
2017-03-02 10:16:38 -08:00
9f18f83375 Downcase setMutex
Summary: TSIA

Reviewed By: andrewwdye

Differential Revision: D4626965

fbshipit-source-id: 2d32b07182202f65e673795aefacc6cc991d3c7c
2017-03-02 10:16:38 -08:00
9c114e6f1c Fix compile error
Summary: std::atomic was not defined for cuda.cu.

Reviewed By: andrewwdye

Differential Revision: D4624611

fbshipit-source-id: 973bba10026e065667d6a576055d00505ee02d62
2017-03-02 10:16:38 -08:00
0e78a59610 add mutex getter/setter to synchronize CUDA and NCCL ops
Summary: Allow gloo consumers to assign a mutex to synchronize CUDA malloc/free and NCCL operations.

Reviewed By: pietern

Differential Revision: D4622135

fbshipit-source-id: 60acd7c01a677a0df5415fe38e6ef5a2e7c8606a
2017-03-02 10:16:38 -08:00
5e7f5db332 add subset samplers (#888) 2017-03-02 09:26:10 -05:00
b5f7592140 boolean mode in module.train 2017-03-02 09:18:05 -05:00
f366e5fc81 Support int16 numpy conversions
issue #891
2017-03-02 09:15:57 -05:00
48f087f6ce C99 cleanup broke MSVC (#952)
* __pragma for MSVC.
2017-03-02 08:57:28 -05:00
7fef264bfa Bumping version to 1.3.3 2017-03-01 16:44:27 -08:00
8996811936 Only enable peer access for ring neighbors.
This enables support for systems with more than 9 GPUs attached to a single PCIe root complex.
2017-03-01 16:42:38 -08:00
c219a183d0 Fix copy/paste typo in error message 2017-03-01 16:42:38 -08:00
8e1d6f9b60 Fix crash in Reduce when non-root ranks have invalid recvbuff 2017-03-01 16:42:38 -08:00
7ad948ffa9 fix tests to not sys.exit(), also fix fatal error on THC initialization 2017-03-01 17:37:04 -05:00
3277d83648 Add Nesterov Momentum (#887) 2017-03-01 20:49:59 +01:00
1487278fdf Allow backprop through cuDNN RNN in eval mode
Handling of dropout descriptors has been improved too.
2017-03-01 19:42:39 +01:00
977630bc15 Handle duplicate backward roots in autograd 2017-03-01 19:42:39 +01:00
12efd53dba ConstantPad2d and F.pad (#856) 2017-03-01 19:39:44 +01:00
37e05485d9 added initialization schemes in torch.nn.init (#833) 2017-03-01 19:34:13 +01:00
c76770f40e Merge commit 'dfca8dfdc5988813ed5673589ffa4fdd1c4f3d2d' 2017-03-01 09:29:51 -08:00
da725830c2 Add support for variable length sequences in RNNs (#873) 2017-03-01 17:36:32 +01:00
fc6fcf23f7 Lock the cudaFree mutex. (#880)
Prevents NCCL calls from overlapping with cudaFree() which can lead to
deadlocks.
2017-03-01 11:29:25 -05:00
b190f1b5bc Add another pinned memory test.
Checks that pinned memory freed on a different GPU from which it was
allocated isn't re-used too soon.
2017-03-01 12:22:31 +01:00
dfca8dfdc5 ensure valid index in multinomial 2017-02-28 14:48:48 -08:00
b46d5e0b04 Fix NN bindings 2017-02-28 14:35:38 -08:00
f19a11a306 Merge commit '8e8022b7351401911e10b94aeb5ae35d32907705' 2017-02-28 14:35:20 -08:00
cfcf69703f Merge commit '80429ad9f7c4775f7f88344a2cf037e499f060b8' 2017-02-28 14:35:00 -08:00
e22b8e0d17 Merge commit '3cc89afde68a831434f3abe9e3af2ac0b134215e' 2017-02-28 14:34:44 -08:00
fbfba6bdca Merge commit '6ff77503645da59eeca5be473a1902e523c4adb3' 2017-02-28 14:34:29 -08:00
3cc89afde6 Merge pull request #713 from killeent/multinomial-indexing-fix
fix indexing bug in sampleMultinomialOnce
2017-02-28 17:13:44 -05:00
1e4aee057c Merge pull request #712 from killeent/multinomial-fixes
Fix sampleMultinomialOnce to better handle large distribution values
2017-02-28 17:12:48 -05:00
8dfcf7e35a Merge pull request #709 from colesbury/pinned_memory
Fix bug where pinned memory event could be recorded on incorrect device
2017-02-28 16:56:21 -05:00
76de151ddd Fix bug where pinned memory event could be recorded on incorrect device 2017-02-28 13:48:56 -08:00
2676cc46c2 fix indexing bug in sampleMultinomialOnce 2017-02-28 13:40:15 -08:00
1bf7bc9768 refactor sampleMultinomialOnce to use <real, accreal>, assertion for sum overflow 2017-02-28 12:46:12 -08:00
3c41c9fe46 Add AutoGPU RAII that doesn't depend on Python API (#875)
Separates out non-Python part of AutoGPU. This also compiles without
CUDA which is useful for generic tensor code.

Also fixes a bug where THCPAutoGPU may not always switch the device:

  THCPAutoGPU guard(-1);
  guard.setDevice(0);
  guard.setDevice(1);
  guard.setDevice(0);  // would not switch back to 0
2017-02-28 14:39:20 -05:00
6ff7750364 add TH_TENSOR_APPLY variants for optimized redux (+refactor) 2017-02-28 10:30:31 -08:00
4d25c3d048 address comments and add tests 2017-02-28 10:23:36 -08:00
267b7ade50 Speed up reductions on non-contiguous dimensions 2017-02-28 10:23:36 -08:00
80429ad9f7 THVector_(add) -> THVector_(adds) 2017-02-28 12:20:44 -05:00
5ca6516ecb THVector_(add),(mul),(div) -> (adds),(muls),(divs) 2017-02-28 12:10:47 -05:00
67f94557ff Expose torch.HalfTensor 2017-02-27 19:35:47 -05:00
61bd5a0643 [Lint] Address F811 2017-02-27 19:33:00 -05:00
748d011c8b [Lint] Address F812 2017-02-27 19:33:00 -05:00
5d5cfe2e57 [Lint] Address E731 2017-02-27 19:33:00 -05:00
7cbe255296 [Lint] Use flake8 instead of pep8 2017-02-27 19:33:00 -05:00
4ef303698c Merge pull request #711 from gchanan/getDeviceAllocator
Add getter for cuda device allocator.
2017-02-27 19:29:39 -05:00
83e8b3f6c3 Add getter for cuda device allocator. 2017-02-27 15:44:44 -08:00
502ebed796 Fix one more reference cycle and ensure correct flag propagation (#868) 2017-02-27 18:38:29 -05:00
68ff58d771 Expose a mutex that is held around cudaFree() calls.
NCCL can deadlock if cudaFree() is called while it's launching kernels.
This exposes a mutex that can be held to prevent cudaFree() calls in the
caching allocator.
2017-02-27 15:08:30 -08:00
969c1602e6 Add Tensor::copy() to THPP
For now, this only supports copying from the same type. We can add
polymorphic copying in the future.
2017-02-27 21:33:40 +01:00
2d4d3b18dd Use NCCL operations in AllreduceChunked
Summary: The AllReduceChunked algorithm currently performs the local reduce/broadcast of local device buffers in host memory. This diff updates the algorithm to execute the local reduce/broadcast steps using NCCL operations before copying a single device buffer to/from host memory.

Reviewed By: pietern

Differential Revision: D4587441

fbshipit-source-id: 4de689f59a6cf898b8eecd3c3b9f57f77124c0e3
2017-02-27 09:59:29 -08:00
5e1d6a3691 Update functional.py (#862)
Fixed documentation error in conv3d
2017-02-27 10:42:02 -05:00
533cfc0381 Minor fix of docs of ModuleList and ParameterList (#861) 2017-02-27 10:09:54 +01:00
2b23712dc3 Improve autograd memory usage (#859) 2017-02-26 22:37:26 -05:00
88275da5e8 CUDA documentation tweaks (#858) 2017-02-26 20:37:43 +01:00
bd7a5ad6f0 Make Optimizer.load_state_dict use __setstate__ 2017-02-26 20:02:42 +01:00
1f6f82dbcf Fall back to indexing compatible with numpy 2017-02-26 20:02:42 +01:00
1f8939937a Allow using expand to broadcast tensors 2017-02-26 20:02:42 +01:00
b3d41a5f96 Add docs for ModuleList and ParameterList 2017-02-26 20:02:42 +01:00
fec2d493a9 Reshape grad_output in basic ops 2017-02-26 20:02:42 +01:00
86ee75f63f Fix for Long and Byte tensor indexing of Variables 2017-02-26 20:02:42 +01:00
31941918cf Prevent creation of reference cycles with leaf Variables that don't require grad
Also, raise an error immediately, if a leaf that requires_grad is
modified in-place. Some comments were updated too.
2017-02-26 20:02:42 +01:00
19a65d2bea Expose stateless methods for torch.cuda.HalfTensor 2017-02-26 20:02:42 +01:00
819d4b2b83 Add finite differences gradcheck (#851) 2017-02-26 08:35:24 -05:00
b87c113cf4 CUDA documentation enhancement and docs versioning (#848)
* Add more detail to CUDA documentation

Also adds better cross-linking to the pages that discuss relevant topics.

* Adds recommendation to torch.save docs

* Make the version numbers for the docs dynamic

Might need tweaks for beta, 1.0, etc.
2017-02-26 08:33:26 -05:00
b25182971f readme change for getting clarity on binaries 2017-02-26 07:52:13 -05:00
1ee2c47e37 Correcting the description of LSTM attributes (#854) 2017-02-26 13:30:55 +01:00
2dc563f1f1 Fix indexing when passing only an Ellipsis 2017-02-25 23:34:09 +01:00
15ba71a275 Rebase fixes 2017-02-25 17:14:52 +01:00
e5b3fc49d6 Implementation of the 3rd set of tensor functions 2017-02-25 17:14:52 +01:00
ae1766951d Link TH and THPP to THD (#57)
* Fix THD library build

* THPP dependency added

* Minor cleanup; Fix build on OSX
2017-02-25 17:14:52 +01:00
02d08dafd9 Add support for IPv6 in Data Channel TCP (#53) 2017-02-25 17:14:52 +01:00
13a5090695 Added a size change in MaxPool1d module and improved tests (#771) (#832)
Backend is SpatialDilatedMaxPooling, so change 3D input (N*C*L)
to 4D size (N*C*1*L). Then output indices will range from 0 to L.
This range will not cause UnMaxPool1D error.

Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-02-25 08:53:30 -05:00
8e32e4c04c make wrap_generic_function importable 2017-02-24 14:27:54 -08:00
cf991310c3 c++ virtual function fix 2017-02-24 13:22:44 -08:00
938706099e adding environment flags to disable SIMD codepaths 2017-02-24 07:35:11 -05:00
3330287dc7 Update dataloader.py (#837) 2017-02-23 14:38:41 -05:00
38c8520adf adding unsqueeze to docs 2017-02-23 12:13:25 -05:00
492e1746af Fix THFree in THTensorApply 2017-02-23 06:01:13 -05:00
91a8109cfd Use C99 for openmp cleanup 2017-02-23 06:01:13 -05:00
161490d34a Add memcpy copy 2017-02-23 06:01:13 -05:00
9c302852eb comments fix 2017-02-23 06:01:13 -05:00
8654fcfd60 THVectorDefault style fix 2017-02-23 06:01:13 -05:00
b3d527d9a0 Tab style fix 2017-02-23 06:01:13 -05:00
4d495218c9 THTensorApply3 contiguous optimizations 2017-02-23 06:01:13 -05:00
13a041284c THTensorApply2 copy optimization 2017-02-23 06:01:13 -05:00
c60c1a003d TH_TENSOR_APPLY2 contiguous optimization 2017-02-23 06:01:13 -05:00
97add1a5ea comment fix 2017-02-23 06:01:13 -05:00
ca02930e47 Fill bug fix 2017-02-23 06:01:13 -05:00
20d5e95077 THTensorApply3 compress counter 2017-02-23 06:01:13 -05:00
eb4a7dc11d THTensorApply change dims to sizes 2017-02-23 06:01:13 -05:00
f722498b72 THTensorApply2 counter compress 2017-02-23 06:01:13 -05:00
aadfb6fe83 THTensorApply reduce memory overhead 2017-02-23 06:01:13 -05:00
6c273594c9 THTensorApply Counter compress 2017-02-23 06:01:13 -05:00
e475c82fa1 Add isTransposed judge and enable multithread of fill functions 2017-02-23 06:01:09 -05:00
0c2e6665df Add AVX copy 2017-02-23 05:50:34 -05:00
6295e6e94b Rebase master 2017-02-23 05:50:34 -05:00
670a4aa708 Fix AVX2 bugs 2017-02-23 05:50:34 -05:00
1bdc2e64ed Add fma cadd 2017-02-23 05:50:34 -05:00
c587be1e50 Add THVector Fill 2017-02-23 05:50:34 -05:00
bd481596f5 optimize THVector add mul div 2017-02-23 05:50:34 -05:00
a504d56b43 Fix THVector cmul AVX bug 2017-02-23 05:50:30 -05:00
91c4dfccea Use THVector cadd AVX 2017-02-23 05:46:44 -05:00
27f618c44d Add THVector Fill AVX 2017-02-23 05:46:44 -05:00
a14482a1df Add THVector cadd AVX 2017-02-23 05:46:40 -05:00
aa50c5734b Add THVector AVX cmul 2017-02-23 05:46:07 -05:00
293001a4fe Add THVector SSE div cdiv 2017-02-23 05:46:07 -05:00
638cfdf150 Add SSE add 2017-02-23 05:46:07 -05:00
5f80a14525 Separate SSE and AVX 2017-02-23 05:46:07 -05:00
1342fd3975 Remove THTensorMathSIMD THTensorMathDispatch 2017-02-23 05:46:07 -05:00
8d4af38489 Add THVector div cdiv 2017-02-23 05:46:07 -05:00
575a064e66 Remove THVector diff 2017-02-23 05:46:07 -05:00
3ab21a3c4f Merge THVector mul AVX 2017-02-23 05:46:07 -05:00
2f592e6c7d Remove THVector scale 2017-02-23 05:46:07 -05:00
5661ffb766 Merge THVector mul 2017-02-23 05:46:03 -05:00
9b74503daa Merge THVector cmul 2017-02-23 05:40:33 -05:00
24848f1cd8 Change THVector mul to cmul 2017-02-23 05:40:33 -05:00
a31a07ede9 Merge THVector add 2017-02-23 05:40:33 -05:00
c8c4c9b23d Change THVector add to cadd and fix NEON 2017-02-23 05:40:33 -05:00
e1ed9303f0 Add multi-thread add 2017-02-23 05:40:33 -05:00
a43aab13c2 Fix THTensorMath.c style 2017-02-23 05:40:33 -05:00
c698b4a45e Add Dispatches for div and mul 2017-02-23 05:40:29 -05:00
c6a0ffab50 Add AVX single float and double float add 2017-02-23 05:40:24 -05:00
8ba7cc30d1 Add THTensorMathSIMD.c 2017-02-23 05:32:34 -05:00
61bf08ca24 Fix compilation for simd tensor add 2017-02-23 05:32:28 -05:00
6ada3c0c16 Fast floating point add kernel in intrinsics (11x speedup over default for 10k elements) 2017-02-23 05:11:44 -05:00
60061fbe79 Fixed up CPU dispatch and tested. Can begin implementing kernels 2017-02-23 05:11:44 -05:00
46e7042add SIMD helper header, modified add in THTensorMath to check dispatch 2017-02-23 05:11:44 -05:00
d0c182773b First commit for dynamic CPU dispatch: general framework in place (need to create dispatch tables and stubs for all functions and make impls have hidden linkage) 2017-02-23 05:11:44 -05:00
b6f60585b5 fix AVX2 detection bugs 2017-02-23 05:00:55 -05:00
4b0e3ee219 Merge pull request #699 from twitter-forks/bitops
Bitwise operations
2017-02-23 04:15:35 -05:00
838842d4b2 fix documentation error. [issue #790](https://github.com/pytorch/pytorch/issues/790) (#831) 2017-02-23 08:59:29 +01:00
e71cf20192 improved serialization (no tar copy) (#713) 2017-02-22 22:24:20 +01:00
adb4cb2b5b contiguous view backward (#816) 2017-02-21 19:09:36 -05:00
478d7446ef CMake fixes
Summary: Adds script to populate third-party directory.

Differential Revision: D4591509

fbshipit-source-id: 28934feb536a9f3a066d8c40988337f3dddffaed
2017-02-21 15:06:45 -08:00
df68230351 README and docs skeleton
Summary: TSIA

Differential Revision: D4591755

fbshipit-source-id: fa435f4ad6b97453c3c9516b4bfc9f8f0fb2e4f1
2017-02-21 10:52:04 -08:00
6073f9b46c update table in README.md
it removes the empty top row
2017-02-21 12:58:04 -05:00
8e8022b735 Merge pull request #418 from ruotianluo/adaptiveAverage
Add SpatialAdaptiveAveragePooling.
2017-02-21 09:15:12 -05:00
da82d2dd70 Merge pull request #434 from bottler/master
VolumetricFractionalMaxPooling like spatial
2017-02-21 09:13:59 -05:00
82176473a5 Merge pull request #442 from twitter-forks/half-fixes
Convert real to accreal in libTHCUNN
2017-02-21 09:12:56 -05:00
2d269a9a72 Merge pull request #1137 from twitter-forks/half-fixes
Using accreal instead of real in the API
2017-02-21 09:12:32 -05:00
240372a991 Fixed topk documentation for largest=True 2017-02-21 04:38:24 -05:00
5b10411c8c Fixed some mistakes in examples
Fixed mistakes in LSTMCell and GRUCell examples.
2017-02-21 04:17:28 -05:00
4c474a9939 Improve prodall CUDA test 2017-02-20 23:28:31 -08:00
7ea6ae57c8 Support numpy arrays in default_collate 2017-02-20 23:28:31 -08:00
42633f8986 Fix misspelling and add support for weights in NLLLoss2d 2017-02-20 23:28:31 -08:00
84248690a9 Add support for indexing with None and slices with positive steps 2017-02-20 23:28:31 -08:00
53409ca0fb Fix a warning in THPP 2017-02-20 23:28:31 -08:00
c2c1710047 Add clip_grad_norm 2017-02-20 23:28:31 -08:00
876202503f Support multiple inputs in data parallel 2017-02-20 23:28:31 -08:00
946a7d9bc3 Make input contiguous only once in backward of cuDNN RNN 2017-02-20 23:28:31 -08:00
608bcd3b15 Return correct number of gradients from cuDNN RNN 2017-02-20 23:28:31 -08:00
632b02a477 Add checks for reward type and size in StochasticFunction 2017-02-20 23:28:31 -08:00
0db9c63300 Use library_dirs in setup.py 2017-02-20 23:28:31 -08:00
873ed4e6b6 Add better error message for conversion of CUDA tensors to numpy 2017-02-20 23:28:31 -08:00
01bd43037d add docs to torch/cuda/random 2017-02-20 20:43:47 -05:00
68c9e3f232 Fixed typo in GRUCell example 2017-02-21 01:37:04 +01:00
a25c8555eb Fixed paper references 2017-02-21 00:27:18 +01:00
d6ca3820aa Optionally specify stream for pointers in CUDA algorithms
Summary:
Work may be queued on CUDA streams for asynchronous execution. The
memory backed by pointers passed to any algorithm can therefore be
mutated after constructing an algorithm instance. By also passing in
the streams these mutations happen on, the algorithms can synchronize
with these mutations to ensure no invalid data is used.

By passing in these streams, any work done by these algorithms will
*also* be queued, which effectively removes a single synchronization
step from any algorithm run.

Differential Revision: D4589394

fbshipit-source-id: 0c8cd6ba9c9018f33d6f4c55a037083fc4164acb
2017-02-20 14:15:53 -08:00
dfd1dff383 Merge commit '4ca26fbc1b7be4e369f84e95df16431bb2f1dcb7' 2017-02-20 08:05:19 -08:00
8f391d4d51 Merge commit 'ee43cd7adca3b24a2071ce6c55dcd3a95a2b6ff6' 2017-02-20 07:55:46 -08:00
2a6b7685ae Merge commit 'f6c1bbfa483ad19c500dc94838baaa69f02d240b' 2017-02-20 07:55:19 -08:00
eb9573107d Merge commit '34b7fed802db1fda6322a70b648dcc4947858719' 2017-02-20 07:54:51 -08:00
ee43cd7adc Do SpatialClassNLLCriterion sizeAverage in a separate kernel 2017-02-20 06:54:23 -08:00
4ca26fbc1b Remove averaging from prodall 2017-02-20 11:37:53 +01:00
c165226325 Print a readable error message when arguments are on different GPUs 2017-02-20 11:35:50 +01:00
0722775ca3 AllreduceRingChunked/CudaAllReduceTest should use the chunked algorithm
Summary: I was mistakenly calling the non-chunked algorithm for the chunked test.

Reviewed By: pietern

Differential Revision: D4580160

fbshipit-source-id: 9d62a68e9e86cc6e596d90ff8854c585a0e8855c
2017-02-17 19:17:44 -08:00
49295ebe54 Add sequential to documentation 2017-02-18 08:42:43 +05:30
455038e470 Use a more stable formula for spatial LogSoftMax 2017-02-17 13:05:45 -08:00
ca7f02ea0c Add shape checks for SpatialClassNLLCriterion 2017-02-17 13:01:56 -08:00
04aba1caec Fix cuDNN dropout desc for multi-gpu (#772) 2017-02-17 19:16:12 +01:00
420488349f Implement CUDA-aware allreduce chunked
Summary:
First pass at a CUDA-aware allreduce chunked implementation. For now the algorithm runs on the CPU and is mostly copy/paste from allreduce_ring.h. A subsequent pass will offload to the GPU.

Serialize cuda test to avoid intermittent failures due to memory contention.

Reviewed By: pietern

Differential Revision: D4576959

fbshipit-source-id: e1f292a05b88ff24c33e549d4a52e770a21f85d2
2017-02-17 09:06:05 -08:00
f6c1bbfa48 Merge pull request #1105 from ruotianluo/adaptiveAvg
Add SpatialAdaptiveAveragePooling
2017-02-17 10:52:33 -05:00
4e2c8c6db5 Merge pull request #1123 from bottler/master
VolumetricFractionalMaxPooling like Spatial...
2017-02-17 10:42:21 -05:00
1a5cae7340 Add busy-poll option in TCP transport
Summary: Ideally we would want the driver to busy-poll for us. In absence of driver support, spinning with MSG_DONTWAIT flag seems to be helping a lot too. Of course, we pay the price of burning one core for polling. Sigh.

Reviewed By: pietern

Differential Revision: D4576242

fbshipit-source-id: 85d9e1b786fbb6053864fba80f3e5ecc80fe221d
2017-02-17 07:31:32 -08:00
c26b9c0a5e Update rnn.py
Based on the https://github.com/pytorch/pytorch/blob/master/torch/backends/cudnn/rnn.py#L302 line, the output is returned in a (0,1) transposed version, if the batch_first argument is set to true.
2017-02-17 14:37:14 +01:00
aaf41c61a6 Fix Engine::compute_dependencies 2017-02-17 18:28:51 +05:30
dd844f741b Fix previous_functions when it contains Variables 2017-02-17 11:03:46 +05:30
4dd19988c3 Add benchmark option to display nanoseconds
Summary:
Latency optimization is going well and I've seen the odd case of <10us
measurements. This option makes the benchmark tool display nanos
instead.

Differential Revision: D4575925

fbshipit-source-id: 98dbd3b39e31cbcdd4c146613f6630e721187e1e
2017-02-16 21:16:26 -08:00
7117a9012e Fix flaky non-contig test 2017-02-17 10:40:08 +05:30
1bdc28161a Add torch.__version__ 2017-02-17 10:40:08 +05:30
5e150caf38 Fix a bug in Engine::compute_dependencies 2017-02-17 10:40:08 +05:30
c0c62d099a Make detach() actually remove the creator 2017-02-17 10:40:08 +05:30
b9ece39685 Make torch.Size methods return torch.Size, not tuple 2017-02-17 10:40:08 +05:30
15ef008877 Using accreal instead of real in the API
- This reverts commit 7a07afe545b4deae5919d9dc268bfac3d37398c7.
- Includes fixes for TemporalRowConvolution
2017-02-16 17:34:11 -08:00
b14d6318f8 Convert real to accreal in libTHCUNN
- This reverts commit 0d85922d116879448485ef88ae21e83a9255a0b0.
- Includes fixes for TemporalRowConvolution
2017-02-16 17:33:03 -08:00
93002720eb Extract CudaDevicePointer for reuse across CUDA-aware algorithms
Summary:
The CudaDevicePointer optionally takes an existing stream on
which it runs any operation associated with the pointer (for now just
memcpy's, but this will likely include kernel execution in the
future).

Differential Revision: D4574035

fbshipit-source-id: ddd7972a3874012059f1fde1b341fd6edd69102d
2017-02-16 14:05:52 -08:00
7c44506441 allow DataParallel to have tuple inputs on a single GPU 2017-02-16 19:07:17 +01:00
937ba581d7 Improve nn.legacy compatibility with Torch7 (#738) 2017-02-16 21:17:12 +05:30
2ae54f1194 setup.cfg -> tox.ini (#761) 2017-02-16 21:13:13 +05:30
cb91078e01 Support synchronous mode for TCP transport
Summary:
In synchronous mode, it is not the device thread that is responsible
for handling I/O, but the user thread itself. Calling waitRecv on a
buffer will trigger the read function on the pair to be called. This
eliminates the context switch necessary if the device thread is
handling all I/O. For benchmarks with small numbers of elements this
reduces latency by as much as 20%.

Reviewed By: plapukhov

Differential Revision: D4549998

fbshipit-source-id: ab718ba090c06d7c7aa4065cc9f92bd96b9e4a35
2017-02-15 17:31:06 -08:00
a217fefee1 Update rnn.py
Fixed a problem with outputting the RuntimeError if arguments are incorrect in cudnn/rnn.py
2017-02-15 21:49:42 +01:00
34b7fed802 Fix gcc 4.4.7 build. 2017-02-15 09:06:25 -08:00
5221745c21 add test for bias=False for 3d convolution 2017-02-15 04:26:44 -08:00
000ca44b16 Merge commit '797544c47a4e9bdff02137a127f883a6df9b3dfe' 2017-02-15 04:24:14 -08:00
8f3d44033b Merge commit '0426f2f3ec2b932cb83d64101081244c2a1451b1' 2017-02-15 04:23:50 -08:00
7cc14c595a Merge commit '07f5b21ef1bd29d1451c616062dcbfc3f8fd7c6a' 2017-02-15 04:23:18 -08:00
797544c47a implementation of bias=False for VolConv.cu 2017-02-15 04:18:17 -08:00
0426f2f3ec implementation of bias=False for VolConv.c
Used .c file changes from 7318e2de13 as a starting point. All changes to .c files (except for whitespace details) are present here.
However, the required .h files were not present in that PR.
2017-02-15 04:16:09 -08:00
336eeee895 kernel_size as the default stride for avg_pool1d (#744)
Following the documentation, let stride be kernel_size if stride is not provided.
2017-02-15 13:12:18 +05:30
593f867e3e Fixed a simple compiling error in mac OS #745. (#746)
Signed-off-by: Zhou Chang <achang.zhou@gmail.com>
2017-02-15 12:19:03 +05:30
385913be1c Fix class torch.nn.ConvTransposeNd documentation (#739)
There is no `dilation`
`output_padding` doc was missing
2017-02-15 10:37:20 +05:30
6aaa14f5fe Fix LSTMCell Doc Typo (#743) 2017-02-15 08:29:17 +05:30
07f5b21ef1 Merge pull request #702 from gchanan/conservativeAllocator
Improve THCCachingHostAllocator performance by making it reclaim less aggressively
2017-02-15 08:26:48 +05:30
ee52f89772 Implement CUDA BroadcastOneToAll algorithm
Summary:
Implement CUDA BroadcastOneToAll algorithm for GPU addresses. Refactor cuda.h into cuda_private.h to allow inclusion of <cuda.h> in public headers without polluting the namespace.

Port broadcast tests to GPU variants.

* this revision is based on Peter's revision D4546932

Differential Revision: D4547382

fbshipit-source-id: 3d294ad8862b04fb783ba22e5c925b8d7cbc8a8d
2017-02-14 18:46:56 -08:00
e454870396 Free set of stored streams and handle NULL streams. 2017-02-14 15:41:47 -08:00
2822013437 Fix flaky tests 2017-02-14 21:28:50 +01:00
72c1982734 Add some more asserts to cuDNN RNN 2017-02-14 21:28:50 +01:00
0de2ea305a Support retain_variables in cuDNN RNN 2017-02-14 21:28:50 +01:00
d899385a3d Raise error when too small input is given to conv 2017-02-14 21:28:50 +01:00
c6d6cbe8a6 Check that all tensors are on the same GPU in cuDNN bindings 2017-02-14 21:28:50 +01:00
85e82e85d8 Fix bug in zero_grad, when some parameters didn't require grad 2017-02-14 21:28:50 +01:00
a1534cc37d Fix auto-gpu in cat 2017-02-14 21:28:50 +01:00
8c8dc791ef Load half and double THCUNN backends 2017-02-14 21:28:50 +01:00
63edca44f2 Add tests for non-contiguous inputs and gradients 2017-02-14 21:28:50 +01:00
6aa8c932fc Benchmark for CUDA-aware algorithms
Summary:
Separate benchmark build target for CUDA-aware algorithms.

This is needed to keep CUDA an optional dependency.

Differential Revision: D4546932

fbshipit-source-id: b73176ae9067233f883d51ba3ab4efbb13a6f86f
2017-02-13 21:32:58 -08:00
8821f4aba6 Fix race in benchmark tool
Summary: TSIA

Reviewed By: plapukhov

Differential Revision: D4549105

fbshipit-source-id: 61c8966e429e0701677f441aeaaf27fdc5e669e7
2017-02-13 21:32:58 -08:00
5e06634f7e Implement initial CUDA-aware allreduce
Summary:
This CUDA-aware ring allreduce is based on the regular ring allreduce.
It runs the reduction algorithm on the CPU and is therefore most
suited for smaller buffers.

Both the device-to-host memcpy's at the start of the algorithm and the
host-to-device memcpy's at the end of the algorithm are kicked off
asynchronously in an attempt to parallelize as much as possible.

Reviewed By: Yangqing

Differential Revision: D4542816

fbshipit-source-id: 101dfad276ca79703e37ff93fb1b6d467295f66b
2017-02-13 21:32:58 -08:00
b82c4b3d38 Split benchmark code into multiple files
Summary:
The CUDA benchmark suite will be a separate build target, so the
runner should be reused.

Reviewed By: Yangqing

Differential Revision: D4545092

fbshipit-source-id: 6ccf2d30f5d35c74fc59851b25416bfe6863d62c
2017-02-13 21:32:58 -08:00
8d90ab2d9b compile with cudart (#737) 2017-02-14 06:40:35 +05:30
bd5303010d Refactor autograd package to separate Python dependencies. (#662)
The core autograd Variable, Function, and Engine no longer depend on the
Python API. This let's us implement functions in C++. In the future, we
can also multithread engine and release the GIL for most of the
non-Python backwards.
2017-02-13 16:00:16 -08:00
16d2c3d7b3 make networks converted with loadcaffe loadable 2017-02-13 23:53:46 +01:00
407a92dc26 std::min() requires same type (#732)
* std::min() requires same type

* cast buffer instead

* declare buffer_size as int64_t
2017-02-13 18:06:05 +01:00
0a893abc7b fix serialization bug for large files 2017-02-12 19:13:02 +01:00
34fa5e0dc7 Update docstrings for testing object type
Add docstring for `is_storage()` and `is_tensor()`
2017-02-12 09:21:01 +05:30
712686ce91 Add cat, contiguous, squeeze, and unsqueeze to THPP
Use unsqueeze and view from TH/THC
2017-02-11 17:49:31 +01:00
518864a7e0 Fix bug in legacy NN updateGradParameters (#714) 2017-02-11 11:04:18 +05:30
72fd605b01 Fix std::accumulate
Summary:
Testing pull request again.
Closes https://github.com/facebookincubator/gloo/pull/2

Reviewed By: pietern

Differential Revision: D4542327

Pulled By: Yangqing

fbshipit-source-id: 5bd66c32c7249f1327225117815bef64b8708722
2017-02-10 10:12:37 -08:00
750fb5cc73 Fixes to support short and char tensors for bitwise operations 2017-02-09 18:52:59 -08:00
0f4749907a Adding bitwise operations
- lshift, rshift, bitand, bitor, bitxor
2017-02-09 18:11:58 -08:00
bd2dc63ef6 Adding bitand, bitor and bitxor 2017-02-09 17:06:04 -08:00
19a8795450 Changes to shift operations
- renaming lsh -> lshift, rsh -> rshift
- adding componentwise functions
2017-02-09 15:41:07 -08:00
d9dccfdd71 Fix for non-contiguous grad_output in cuDNN conv 2017-02-10 00:25:59 +01:00
7547a06c4f Avoiding duplicated unsigned as it causes error on gcc. 2017-02-09 13:29:05 -08:00
8929b75795 Added shift operations. 2017-02-09 13:28:36 -08:00
4d37ef878c Remove view on data and target tensors of dim 1 in TensorDataset (#609) 2017-02-09 22:06:39 +01:00
efd8998690 Import gloo
Summary:
In the GitHub repository this directory will be mirrored similar to
folly, such that the repository has a single top level directory
called "gloo". This allows for versioning or renaming of the
project root, without having to mangle the include paths; they will
always use the "gloo" prefix.

fbshipit-source-id: 24502e4185fc7cbe19b5249f83609e2b8118e9d7
2017-02-09 12:33:54 -08:00
126e77d5c6 Merge commit 'e9b05c71b4acf210fad719f4da8bb58a425dd00b' 2017-02-09 12:31:58 -08:00
53eec78bea Merge commit 'ac9312e9f8002227b267a82e224a5a99c7a7e734' 2017-02-09 12:31:40 -08:00
a4edaec81a Merge commit 'aeb7a72620be47c0e6a8928a9cb6df49c06902a0' 2017-02-09 12:31:16 -08:00
92481b59d3 Merge commit '73d232ee454ca25de5552d347a2b06820f30d193' 2017-02-09 12:30:39 -08:00
6c77fa9121 Changes in RNNBase and Embedding for compatibility with DataParallel (#660) 2017-02-09 22:36:26 +05:30
aeb7a72620 Merge pull request #693 from colesbury/view
Add code for 'view' to THC
2017-02-09 12:09:28 +05:30
73d232ee45 Merge pull request #926 from colesbury/view
Add code for 'view' to TH
2017-02-09 12:08:57 +05:30
c0c65bf915 Merge pull request #696 from colesbury/unsqueeze
Add unsqueeze to THC
2017-02-09 11:08:20 +05:30
f6cee952af Merge pull request #929 from colesbury/unsqueeze
Add unsqueeze1d to TH
2017-02-09 11:07:47 +05:30
e74184f679 Make THCCachingHostAllocator less aggressive.
In cases where copyAsync is a large percentage of the work,
processing events in recordEvent can cause a large bottleneck.

Here, we relax the constraint that we reclaim blocks as fast as possible
(i.e. in copyAsync); instead, we only check that a block can be re-allocated
in malloc and free.
2017-02-08 14:44:24 -08:00
3884d36176 Add unsqueeze to THC 2017-02-08 13:49:32 -08:00
e7c6886a00 Add unsqueeze1d to TH
Unsqueeze inserts a singleton dimension. Unlike view, it doesn't require
the tensor to be contiguous.
2017-02-08 09:52:50 -08:00
024d1e2678 Merge pull request #69 from cwhipkey/master
Qualify nullptr_t with std::
2017-02-08 09:17:50 -08:00
ed8e92f63d Expose rawSet and rawResize as resizeNd and setStorageNd 2017-02-08 09:00:22 -08:00
fb97df5d65 Expose rawSet and rawResize as resizeNd and setStorageNd
These methods are useful from C because they don't require constructing
THLongStorages to wrap the sizes and strides, which can lead to leaked
memory in case of an error. Instead the sizes and strides can be
represented on the stack using standard C long arrays.
2017-02-08 08:56:04 -08:00
e9b05c71b4 Use THCTensor rather than THCudaTensor in THCUNN.h definition of
GatedLinearUnit.
2017-02-08 07:54:10 -08:00
5eab428294 Qualify nullptr_t with std::. 2017-02-08 07:06:31 -08:00
7926324385 Corrected parameter typo in Adam docstring (#697) 2017-02-07 19:00:10 +01:00
1527b37c26 Fixed typo and rendering of some equations (#693)
* Fixed typo and rendering of some equations

* Few more fixes to MSELoss docs

* Cleaning up whitespace to make pep8 happy
2017-02-07 18:59:27 +01:00
de4659659b The RNNCell's example can not run correctly 2017-02-07 18:58:19 +01:00
a96a8c8336 Static build support + Query CUDA driver, runtime versions (#695) 2017-02-07 08:34:20 +05:30
691aa19b88 Add code for 'view' to THC 2017-02-06 14:04:04 -08:00
6b07dc9e22 Add code for 'view' to TH 2017-02-06 14:00:48 -08:00
8aa259b52b review comments from gchanan 2017-02-06 11:08:23 +00:00
ac9312e9f8 Bugfix/rowconv (#1126) 2017-02-04 20:37:45 +05:30
91a17b702b half<->float conversion cleanup (#901)
* half<->float conversion cleanup
2017-02-04 07:30:13 +05:30
c54597e0b2 std::move fixes 2017-02-03 21:31:03 +01:00
a9785bba44 cuda implementation of Gated Linear Unit, fixed issues with genericization 2017-02-02 21:38:25 -08:00
833b8cbc7a Remove unused code from module 2017-02-02 17:20:11 +01:00
75aeb16e05 Merge commit '72089c9c36c6b880c695baf732cd04329d72c098' 2017-02-01 22:00:42 -08:00
fc354a0d6e Revert "cuda implementation of Gated Linear Unit, fixed issues with genericization" 2017-02-02 10:50:47 +05:30
262611fcd3 Merge pull request #430 from huihuifan/newCudaGLU
cuda implementation of Gated Linear Unit, fixed issues with genericization
2017-02-02 08:16:35 +05:30
b8a34f3033 Small fixups:
1) Add return after THError for completeness.
2) Fix brace formatting
2017-02-01 15:46:19 -08:00
10bb6bb9b8 Fix function names in error messages 2017-02-01 15:21:57 -08:00
3c9ef69c37 Fix THCTensor::isSparse 2017-02-01 14:51:06 -08:00
dee987d6ee use pseudo-fp16 2017-02-01 23:48:09 +01:00
138f254ec1 Support sparse tensors in THPP (#667) 2017-02-01 17:34:50 -05:00
c7c8aaa7f0 Add ModuleList and ParameterList to nn 2017-02-01 23:26:31 +01:00
d0db624e02 Add W503 to PEP8 ignore list (#646) 2017-02-01 15:57:09 -05:00
e3e7b76310 Rename all normal and log_normal args to std 2017-02-01 21:48:11 +01:00
dad02bceb9 Remove duplicated line in cwrap 2017-02-01 21:48:11 +01:00
b195285879 Improve CUDA detection in THPP 2017-02-01 21:48:11 +01:00
8f3da5b51d set_index -> _set_index 2017-02-01 21:48:11 +01:00
825e919eb8 Add torch.unbind 2017-02-01 21:48:11 +01:00
acb0ce8885 Add LongTensor indexing support 2017-02-01 21:48:11 +01:00
72089c9c36 Update THHalf.c 2017-02-01 11:53:29 -08:00
cf2f158fec Remove erroneous proprietary license header
This change was approved by NVIDIA Legal, and I am authorized to make the change on behalf of the company.
2017-02-01 11:43:44 -08:00
41ddc2a786 VolumetricFractionalMaxPooling like Spatial... 2017-02-01 12:01:09 +00:00
e4886f6589 VolumetricFractionalMaxPooling like spatial 2017-02-01 11:52:49 +00:00
6470b5bd21 Add test for Embedding with sparse=True (#663) 2017-02-01 09:54:42 +05:30
tvn
44196955e2 ByteTensor should be unsigned (#664)
ByteTensor should be unsigned
2017-01-31 21:43:39 -05:00
f08ec1394d Fix bug with inplace TH(CU)NN
Also, remove unnecessary zero_() calls
2017-01-31 21:00:49 +01:00
f8fb25e0a2 Add generic bindings to THNN and THCUNN (#645)
Adds bindings using thpp::Tensor to THNN and THCUNN. This allows calling
into those APIs without knowing the concrete types of the tensor
arguments.
2017-01-31 13:23:02 -05:00
6a0c66752f Fix documentation and argument name for Tensor.normal_(mean, stddev) (#652) 2017-01-31 11:55:39 -05:00
a1bd4efb08 readme: add guidance on disabling CUDA (#655) 2017-01-31 14:05:51 +05:30
b43ce05268 Refactor parts of utils.h (#648)
Moves THPObjectPtr into a separate header, so that it can be included
independently. Currently, utils.h requires all of THP.h. Also adds RAII
structs for acquiring and releasing the GIL.
2017-01-30 21:16:28 -05:00
80e56cfda9 Merge commit 'dc9a5b7d2fbcf21268b524b9da5ae38a74214a59' 2017-01-30 17:58:05 -08:00
24701fc5a7 Merge commit '03dcf8a83bb009ecfdd8f27c4d9a6db40829b690' 2017-01-30 17:57:20 -08:00
f78a266d99 Merge commit '368cbe615d0a7bdaadddcb3bd390abcd4cc17b91' 2017-01-30 17:56:37 -08:00
f096fb6859 adding cudnn V6 support (#515) 2017-01-31 02:01:37 +01:00
a3e11d606b Fix linter errors 2017-01-31 01:58:09 +01:00
79232c24e2 Fixes after rebase 2017-01-31 01:58:09 +01:00
15d9d499ab Remove ZMQ dependency from compilation files 2017-01-31 01:58:09 +01:00
962084c8e8 Add Data Channel receive from any source (#52) 2017-01-31 01:58:09 +01:00
7518b1eefb Introduce Scalar for easier send/receive types through DataChannel 2017-01-31 01:58:09 +01:00
8215d7a4ba Implement TH_API functions from the set 2 (#49) 2017-01-31 01:58:09 +01:00
5aaa220d84 Thd functions v3 (#46) 2017-01-31 01:58:09 +01:00
12c16ab9bc Remaining storage functions implemented 2017-01-31 01:58:09 +01:00
76520512e7 DataChannel tests rewrite (#42); DataChannel isend and irecv implementation (#44) 2017-01-31 01:58:09 +01:00
66de965882 Replace ZeroMQ (#41) 2017-01-31 01:58:09 +01:00
10d32fb0b7 Fix DataChannel tests failure (#43)
Tests failed due to accessing reference which could be invalid.
2017-01-31 01:58:09 +01:00
e72c9b6e4a Storage constructors implemented (#40) 2017-01-31 01:58:09 +01:00
ac1f68127a Add barrier, scatter, gather and allGather implementations + groups (#34) 2017-01-31 01:58:09 +01:00
60d1852c7b Major improvements to master-worker mode
* Fixed all undefined symbol errors
* Implemented storage interface and THStorage class
* RPC improvements
* Code refactor
2017-01-31 01:58:09 +01:00
d53eb521fc Add missing headers. 2017-01-31 01:58:09 +01:00
9808932f10 Refactor RPC and change TensorType to Type 2017-01-31 01:58:09 +01:00
ea876eb6d5 Add initial bindings for master-worker mode 2017-01-31 01:58:09 +01:00
0a45864866 Add THDStorage and improve master-worker mode implementation 2017-01-31 01:58:09 +01:00
2560b39796 Merge TensorTypeTraits.hpp with TensorTraits.hpp 2017-01-31 01:58:09 +01:00
21afa4c88b Worker handling for constructors + destructor 2017-01-31 01:58:09 +01:00
9fc3c5e4d2 THDTensor constructors implemented + some minor fixes 2017-01-31 01:58:09 +01:00
3e3501c98d Integration tests of the THD Python interface (#28) 2017-01-31 01:58:09 +01:00
5e6fcd02b5 Implement data channel groups (#25) 2017-01-31 01:58:09 +01:00
d46ebcfadf Fix broadcast and reduce implementations
Due to bad rank mapping, broadcast and reduce were connecting
the wrong processes, which resulted in errors or tensors not being received/sent.

 * Introduced new mapping method to solve this problem.
 * Added and improved tests for these cases.
2017-01-31 01:58:09 +01:00
41480c8cf2 Data channel maintenance 2017-01-31 01:58:09 +01:00
236890d902 Fix transitive library dependencies in CMake 2017-01-31 01:58:09 +01:00
55632d81d2 Add Python wrappers for process group mode 2017-01-31 01:58:09 +01:00
0b276d622e Add reduce and allReduce implementations (#15) 2017-01-31 01:58:09 +01:00
c81491b37d Preserve directory structure when installing headers 2017-01-31 01:58:09 +01:00
42e189425f Detect ZMQ libs and headers in CMake 2017-01-31 01:58:09 +01:00
3cfa0d7199 Expose C API for process group mode 2017-01-31 01:58:09 +01:00
7c9e088661 Reorganize THD directory structure 2017-01-31 01:58:09 +01:00
e78aa4bb84 Implement CommandChannel with ZMQ. 2017-01-31 01:58:09 +01:00
f8e94d0d8b Implement DataChannel (MPI and TCP) (#8) 2017-01-31 01:58:09 +01:00
ebe6f40fce RPC message packing and unpacking implemented 2017-01-31 01:58:09 +01:00
5fb37efb46 Use #pragma once instead of defines 2017-01-31 01:58:09 +01:00
4f47855873 Style improvements 2017-01-31 01:58:09 +01:00
52ae6f682f Add initial version of tensor wrappers 2017-01-31 01:58:09 +01:00
c35f58f97b Template for THD implementation 2017-01-31 01:58:09 +01:00
659b2f3154 Add more autograd functions 2017-01-31 00:39:34 +01:00
5ea05cfb96 Return indices from Variable sort and topk 2017-01-31 00:39:34 +01:00
dc9a5b7d2f Fix memory leak in SpatialMaxUnpooling 2017-01-30 23:23:07 +01:00
f7ab5a128a Delete extra bracket in RNNCellBase.__repr__. (#637)
This extra bracket causes a ValueError when trying to print a Module that uses RNNCellBase or any of its subclasses.
2017-01-29 23:21:24 -05:00
368cbe615d Add Ubuntu 16.04 lib paths in CMake 2017-01-30 01:16:02 +01:00
d4c9a3782b billinear -> bilinear, docs for upsampling, improved docs for Unpooling, pep8 tests fix (#617)
* billinear -> bilinear, docs for upsampling, improved docs for Unpooling, pep8 tests fix
2017-01-30 05:08:48 +05:30
172dca5e8b Fix bug in cat (non-contiguous first input) 2017-01-29 21:25:53 +01:00
818bf0c408 Compile with asserts by default 2017-01-29 21:21:59 +01:00
03dcf8a83b Compile with asserts on by default 2017-01-29 21:18:54 +01:00
604f607fd1 Add asserts in index* functions 2017-01-29 21:18:43 +01:00
956d946c25 Default initial hidden states for recurrent layers (#605)
Fixes #434
2017-01-29 12:38:56 +01:00
970caaa621 Exclude sphinx_rtd_theme from pep8 2017-01-28 23:37:39 -05:00
00a5980cdf Improve RNN doc formatting 2017-01-28 23:37:39 -05:00
e24eee04f0 Link THC to THPP 2017-01-28 23:37:39 -05:00
f1b3af4ee2 Add more bernoulli options in cwrap 2017-01-28 23:37:39 -05:00
fb2d28f477 remove circular references in NestedIOFunction 2017-01-28 23:30:06 +01:00
3a704ff725 Fix legacy load_lua for SpatialConvolution (#608)
* fix legacy load_lua for conv2d

* fix pep8
2017-01-28 20:19:18 +01:00
0180e638e5 Remove unnecessary zero_() calls in cuDNN RNN 2017-01-28 14:36:57 +01:00
95c6ae04fb Fix non-contiguous grad handling in cuDNN RNN 2017-01-28 14:36:57 +01:00
27c4c6e0af Merge commit '6ee77b4edd1552d3a9a2e5389ffc351e513a8089' 2017-01-27 17:29:07 -08:00
da17414b3f Merge commit '343d65db91c2419843d36aed5467c2d1374108bc' 2017-01-27 17:16:08 -08:00
be2b27a747 Merge commit '4461ae809043390d5223905cb82b17035c7f9f31' 2017-01-27 17:15:21 -08:00
aec2c8f752 Merge commit 'c45ff2efe64d0face3889194ba6f885fe9cc4d48' 2017-01-27 17:12:13 -08:00
13e34b4679 Fix multiprocessing tests 2017-01-28 01:18:42 +01:00
57373c7c29 Fix docs 2017-01-28 01:16:04 +01:00
79f5bf84e5 [pep8] Potentially breaking docstring changes 2017-01-28 01:15:51 +01:00
3ed720079e [pep8] Fix most remaining lint manually 2017-01-28 01:15:51 +01:00
e7c1e6a8e3 [pep8] Fix most lint automatically with autopep8
Here's the command I used to invoke autopep8 (in parallel!):

    git ls-files | grep '\.py$' | xargs -n1 -P`nproc` autopep8 -i

Several rules are ignored in setup.cfg. The goal is to let autopep8
handle everything which it can handle safely, and to disable any rules
which are tricky or controversial to address. We may want to come back
and re-enable some of these rules later, but I'm trying to make this
patch as safe as possible.

Also configures flake8 to match pep8's behavior.

Also configures TravisCI to check the whole project for lint.
2017-01-28 01:15:51 +01:00
f1d0d73ed7 Fix flaky Sqrt test 2017-01-28 00:45:49 +01:00
9c411513bf Patch distutils crash when linking with ccache 2017-01-28 00:28:33 +01:00
ce78bc898b Fix travis builds and add ccache 2017-01-28 00:28:33 +01:00
887002e932 Add bindings to CUDA tensors and storages in THPP (#615) 2017-01-27 18:15:56 -05:00
31dea5ff23 Small typo in README (#613) 2017-01-27 20:18:36 +01:00
ec4602a973 Fix bad code alignment (#612)
forward *is* a method of the Linear class
2017-01-27 20:16:49 +01:00
a38749d15f Fix cuda notes
Target GPU *is* consistent with source GPU
2017-01-27 19:30:49 +01:00
6ee77b4edd Added cunn support for TemporalRowConvolutionMM (#415)
* Added cunn TemporalRowConvolutionMM support
2017-01-27 13:30:25 -05:00
343d65db91 Rowconv repull (#1120)
* Added TemporalRowConvolutionMM layer, tests, and documentation
2017-01-27 13:29:05 -05:00
6328981fcf cuda implementation of Gated Linear Unit, fixed issues with genericization 2017-01-26 22:56:33 -08:00
a90913105c add make-contiguous in batchnorm backward (#602) 2017-01-26 16:17:39 -05:00
9368596059 legacy.nn Attributes: Add '_gradOutput' to SpatialConvolution. (#600) 2017-01-26 15:00:41 -05:00
80ed795ff1 Minor ffi utils fix 2017-01-26 11:55:49 +01:00
a2938e3d11 add cc 3.0 to nccl (#594) 2017-01-25 22:47:23 -05:00
2ad967dbe4 Fix pep8 in setup.py with "autopep8 -i setup.py" 2017-01-25 22:23:22 -05:00
7415c090ac Check setup.py for pep8 lint on TravisCI 2017-01-25 22:23:22 -05:00
a1fa995044 Fixes and improvements (#593)
* Fix error in ELU backward

* Add --seed flag for tests

* Add test for BatchNorm eval

* Fix autograd.backward docs

* Support cc flags in cuDNN search

* Fix IndexSelect backward formula
2017-01-25 22:21:49 -05:00
3c2ecc6b15 add dockerfiles (#583)
* add dockerfiles
2017-01-25 17:30:29 -05:00
fa1516d319 Install THCUNN.h and generic/THCUNN.h
The THCApply.cuh is moved to the .cu files so that THCUNN.h can be
compiled by a standard C compiler.
2017-01-25 14:13:17 -08:00
5e26f49db4 Install THNN.h and generic/THNN.h 2017-01-25 14:09:09 -08:00
7694f65120 Revert "Using accreal instead of real in the API" 2017-01-25 16:26:42 -05:00
b5ebf68df1 Revert "Convert real to accreal in libTHCUNN" 2017-01-25 16:13:20 -05:00
aa46055274 Update CI links in README (#579) 2017-01-25 13:58:05 -05:00
2cad802b68 Revert "cuda implementation of Gated Linear Unit" 2017-01-25 13:15:22 -05:00
2d01f384f1 fallback to nn batchnorm on backward-evaluate (#589) 2017-01-25 12:38:57 -05:00
f8d4f980b3 Add upsampling modules and functions 2017-01-24 17:30:50 -05:00
4f5a6c366e Make Variables non-comparable 2017-01-24 17:30:50 -05:00
ecfcf39f30 Improve optimizer serialization
Also, add optimizer.load_state_dict
2017-01-24 17:30:50 -05:00
3975a2676e Fix invalid DECREF in torch.Size constructor 2017-01-24 17:30:50 -05:00
138ee75a3b Fix for target_link_libraries on CMake 2.8 (#581) 2017-01-24 17:26:24 -05:00
0048f228cb Add spatial test for LogSoftmax 2017-01-24 23:24:25 +01:00
2748b920ab make adam have the same lr as lua torch (#576) 2017-01-24 16:35:28 -05:00
a92a2312d4 Add missing fields to read_lua_file for BatchNorm and Linear layers. 2017-01-24 22:09:47 +01:00
945ce5cdb0 Fix math block of GRUCell in docs (#572)
Added a blank space between the beginning of the `.. math::` block, otherwise it is displayed as a code block.
2017-01-24 14:28:56 -05:00
b39de2cbbe Merge pull request #416 from pavanky/half-fixes
Convert real to accreal in libTHCUNN
2017-01-24 12:17:49 -05:00
49a555e0f5 Merge pull request #1109 from pavanky/api
Using accreal instead of real in the API
2017-01-24 12:17:17 -05:00
ce13900148 update From Source instructions 2017-01-24 10:48:25 -05:00
4c77ad6ee4 step_rate -> lr in adadelta (#569) 2017-01-24 10:05:59 -05:00
0bc4246425 adding NLLLoss2d to docs 2017-01-24 09:22:51 -05:00
c45ff2efe6 Merge pull request #915 from pavanky/convert
Macros to convert between real and accreal
2017-01-24 09:14:33 -05:00
99b520cc5d Merge pull request #421 from huihuifan/cudaGLU
cuda implementation of Gated Linear Unit
2017-01-24 09:13:34 -05:00
e05607aee1 Add fall back to implicit GEMM and friends. (#558)
If we can't allocate the workspace for the desired algorithm, we fall
back to a default algorithm which does not require a workspace.
2017-01-24 09:10:39 -05:00
a360ba1734 Add a hint about CUDNN_STATUS_NOT_SUPPORTED 2017-01-24 09:09:30 -05:00
c661b963b9 Add more contiguity checks to cuDNN 2017-01-24 09:09:30 -05:00
e374dc1696 add step rate to adadelta (#568)
Scales `delta` before it is applied to the parameters in order to control the learning rate of the optimizer (inspired from climin optim lib for theano).
Also changed the link to the Adadelta paper to point to the right location.
2017-01-24 08:48:19 -05:00
116e0c7f38 Merge commit '45596d52897fb187701943cb77456ff1e7249989' 2017-01-23 14:37:44 -08:00
45596d5289 Add contiguity checks to THCUNN 2017-01-23 14:17:51 -08:00
342e7b873d fixing THPP cmake for cmake < 3.1 (#559) 2017-01-23 14:47:06 -05:00
00410c4496 Fix broken THNN groups in conv functions 2017-01-22 18:32:51 -05:00
8b9276bbee Fix view bug in Conv1d 2017-01-22 18:32:51 -05:00
3238786ea1 Improve optimizer error messages 2017-01-22 18:32:51 -05:00
07ebbcbcb3 Add Parameter docs 2017-01-22 18:32:51 -05:00
ca555abcf9 fix comments 2017-01-22 18:02:40 -05:00
63893c3fa2 Fix auto-gpu semantics for indexing 2017-01-22 18:02:40 -05:00
f8ae34706e Port L-BFGS from Lua optim 2017-01-22 18:02:40 -05:00
f8e89fbe11 fix docs for torch.nn.functional.conv1d (#536) 2017-01-21 10:41:52 -05:00
30d208010c Fix segfault when a None gradient was given to a hook (#533) 2017-01-21 10:39:35 -05:00
Tom
017c7efb43 Fix typo in LSTMCell documentation 2017-01-21 15:35:48 +01:00
0c69fd559a Fix CUDA sharing across processes (#530) 2017-01-20 18:28:39 -05:00
c991258b93 fix formula for GRU cells 2017-01-20 17:28:57 -05:00
9f89692dcd adding documentation for some lapack functions (#528) 2017-01-20 16:56:37 -05:00
c28575a4eb Fix typo in documentation for autograd 2017-01-20 21:59:33 +01:00
c9db9c2317 Add C++ tensor library (from THD fork) (#526) 2017-01-20 15:23:34 -05:00
16a09304b4 fix documentation of LSTM cell (#525) 2017-01-20 12:01:50 -05:00
58a88d1ac0 Fix doc search and warnings 2017-01-20 11:36:41 +01:00
b740878697 Updated h0,c0 shape in documentation for RNN, LSTM, GRU (#519) 2017-01-20 10:12:44 +01:00
7179002bfb cuda implementation of Gated Linear Unit 2017-01-19 23:01:30 -08:00
43b5be1d78 added c implementation of GatedLinearUnit 2017-01-19 22:18:08 -08:00
173c81c2d2 import package at the beginning 2017-01-20 00:09:22 +01:00
ee4c77c59f Docs improvements (#512)
* Always compile .numpy() for all types

* Add torch.nn.functional docs and hidden headers

* Use sphinx to generate torchvision docs

* Remove unused import in ffi utils
2017-01-19 17:28:49 -05:00
30ec12fdd5 update readme for source installs to make magma dependency optional 2017-01-19 16:20:13 -05:00
269ec0566f fix typo 2017-01-19 14:26:50 -05:00
a0a95c95d4 Add Random Number Generator Docstrings (#506) 2017-01-19 11:10:01 -05:00
1335b7c1da Fix unpooling docs (#492) 2017-01-19 11:08:43 -05:00
6d14ef8083 Update batchnorm docstrings
Add missing full stops, and added blank line for increased clarity on rendered documentation.
2017-01-19 14:15:26 +01:00
26a492acf3 Update docstring for ConvTranspose functions
Transposed convolutions are often (but incorrectly) referred to as Deconvolutional operations. Made mention of this in the docstring to make it easier for people to search for this operation in the documentation.
2017-01-19 13:02:58 +01:00
f2741e8038 format fix (#490) 2017-01-18 21:41:10 -05:00
8d1a6975d2 Fix for non-contiguous from_numpy (#489) 2017-01-18 18:53:13 -05:00
c414bf0aaf Fix handling of unicode in torch._C._add_docstr (#487) 2017-01-18 17:22:30 -05:00
99f4864674 fixed RMSprop initialization (#485)
* fixed RMSprop initialization
2017-01-18 17:05:53 -05:00
784cbeff5b added a non-exhaustive list of contributors 2017-01-18 13:54:56 -05:00
9302f860ae Remove unused file TensorDocstrings.cpp (#481)
Tensor docstrings are created in _tensor_docs.py
2017-01-18 13:34:40 -05:00
ac8a5e7f0d Remove error message assertion (#480)
Depending on how PyTorch is compiled, the source code for DataLoader
might not be fully available which can cause a spurious error in
test_dataloader.py
2017-01-18 13:16:38 -05:00
798fc16bbf add beta tag 2017-01-18 12:21:46 -05:00
0f65c9267d Fix typo 2017-01-18 08:46:04 -08:00
be45231ccb Improve ffi utils (#479)
* Improve ffi utils
2017-01-18 11:17:01 -05:00
279aea683b update conda install command 2017-01-18 10:52:49 -05:00
8aa8f791fc add more torch.* and Tensor docs (#476) 2017-01-18 08:39:33 -05:00
6464e69e21 Docs for torch.Storage (#475) 2017-01-18 03:22:30 -05:00
a93812e4e5 Fix PowConstant (#471) 2017-01-18 01:53:30 -05:00
225f942044 Disable IndexCopy test until #473 is fixed (#474) 2017-01-18 01:18:18 -05:00
d951d5b1cd Fix tensor.cuda(0) when on non-zero device. (#472) 2017-01-18 01:08:37 -05:00
2082ccbf59 More Tensor docs (#470) 2017-01-18 00:42:41 -05:00
473e795277 Fix invalidArguments for functions with tuple outputs, but no other (#468)
arguments.

For example:

   >>> torch.randn(5, 5).geqrf('invalid arg')
   TypeError: geqrf received an invalid combination of arguments - got (str), but expected ()
2017-01-17 23:14:40 -05:00
a09f653f52 Begin to document TensorBase methods (#466) 2017-01-17 21:44:12 -05:00
90fe6dd528 remove spurious pprint 2017-01-17 21:43:38 -05:00
57a2ccf777 PYTORCH_BUILD_VERSION to setup.py 2017-01-17 17:51:16 -08:00
b5f6fdb814 Using accreal instead of real in the API
This is done to be consistent with the changes made to cunn
2017-01-17 16:58:19 -08:00
205b9bc05f fix build_all.sh 2017-01-17 16:55:46 -08:00
14d5d52789 Add placeholder tensor documentation for methods that exist in torch. (#463) 2017-01-17 19:37:47 -05:00
9c218b419f kl_div and docs (#429) 2017-01-17 19:24:01 -05:00
a69d819901 Converting all instances of real to accreal in libTHCUNN
This is because the current version of luaffifb fails to pass
custom structs (i.e. half) as arguments or accept them as return
values.

The accreal parameters are immediately converted to real internally.
This is done to ensure none of the internal code needs to be changed.

This change also removes transform_reals_to_half which is no longer
necessary.

Change-Id: I978151d001de5492576fb0eddfa0608cd4e99149
2017-01-17 16:06:42 -08:00
517fb2f410 Remove free() and retain() from Tensor (#464) 2017-01-17 18:15:11 -05:00
fef2b1526d Adding macros to convert between real and accreal 2017-01-17 15:14:45 -08:00
3719994c96 Remove redundant code in THGenerateAllTypes.h 2017-01-17 15:12:43 -08:00
35c2821d71 Add documentation for methods defined in TensorBase (#462) 2017-01-17 17:40:54 -05:00
e4812b3903 add binary version to setup.py 2017-01-17 14:14:01 -08:00
4cc11066b2 Add torch.utils.data docs and improve notes (#460)
* Add torch.utils.data docs and improve notes
2017-01-17 14:51:05 -05:00
85b64d77b7 Merge pull request #461 from colesbury/visiondocs
Add torchvision reference to docs
2017-01-17 14:50:00 -05:00
db7948d7d5 Add torchvision reference to docs
Some documentation is just copied from the GitHub readme for now.
2017-01-17 11:40:33 -08:00
3d40c0562d improve build_all.sh 2017-01-17 09:49:48 -08:00
146bcc0e70 adding binary build copy option to build_all 2017-01-17 07:52:18 -08:00
8d9f6c2583 Minor fixes to docs 2017-01-17 10:19:14 -05:00
ac32d8b706 fix docs 2017-01-16 21:08:14 -05:00
15c1dad340 Minor fixes and torch.cuda docs 2017-01-16 20:38:14 -05:00
6d8baf7c30 Fix Sphinx warnings 2017-01-16 20:38:14 -05:00
7ced682ff5 Add notes 2017-01-16 20:38:14 -05:00
89cab4f5e6 fix readme language and links 2017-01-16 20:35:08 -05:00
a0afb79898 add pic to readme 2017-01-16 20:15:19 -05:00
d6fa3b3fd5 Deprecate nn.Container in favor of nn.Module 2017-01-16 19:07:37 -05:00
f91bb96071 Remove cmin, cmax and cinv 2017-01-16 19:07:37 -05:00
3b6644d195 Minor README fix 2017-01-17 00:15:06 +01:00
652b468ec2 Readme improvements 2017-01-16 18:05:26 -05:00
af110d37f2 remove old docs 2017-01-16 15:06:08 -05:00
38967568ca Make load_state_dict() more restrictive (#451)
The load_state_dict() function now raises an error if the argument
state_dict has extra keys or is missing keys.

Previously, load_state_dict() ignored extra and missing keys, which made
it hard to notice when you load an invalid state_dict. This could
happen, for example, if you save the state_dict for a DataParallel, but
load it into a single model.

The state_dict() function now only includes the Tensor data from the
parameters, which reduces checkpoint size by not saving gradients.
2017-01-16 13:06:00 -05:00
df79631a72 Fix a mistake in autograd docs 2017-01-16 12:59:47 -05:00
95f0fa8a92 Change .grad attribute of Variables to be a Variable 2017-01-16 12:59:47 -05:00
1c6ff53b60 Make storages unresizable once exported to numpy 2017-01-16 12:59:47 -05:00
1dbf44c00d Add SmoothL1Loss to functional 2017-01-16 12:59:47 -05:00
1259a0648b Make nn containers copyable 2017-01-16 12:59:47 -05:00
b0055f6229 Improve argument checks for long arg options 2017-01-16 12:59:47 -05:00
90040afc44 Fix cwrap option filtering 2017-01-16 12:59:47 -05:00
59bc96bdc2 Check dropout probability 2017-01-16 12:59:47 -05:00
676ffee542 Check params type in optimizers 2017-01-16 12:59:47 -05:00
77136e4c13 Add anything in torch.legacy docs 2017-01-16 12:59:47 -05:00
604e13775f Add optim docs 2017-01-16 12:59:47 -05:00
02380a74e3 Add warnings to multiprocessing docs 2017-01-16 12:59:47 -05:00
4461ae8090 include cstddef for msvc 2017-01-15 23:45:48 +08:00
2b948c42cd Add SpatialAdaptiveAveragePooling. 2017-01-14 19:44:07 -06:00
133c1e927f fix readme, bump version 2017-01-14 17:47:35 -05:00
b2ae054410 Add SpatialAdaptiveAveragePooling. 2017-01-14 15:27:52 -06:00
2290798a83 if nccl is available, do not compile it and load system version 2017-01-14 10:09:48 +01:00
fd600b11a6 Merge commit '2b88d85505d7317f980e69201e72694d6d5905a4' 2017-01-13 15:58:54 -08:00
b5c9f5c4c3 Merge commit 'ca74bb17b8823d74b83433e2743f23e572501c72' 2017-01-13 15:55:19 -08:00
b8a5b1ed8e Merge commit 'e67b525388a5ae11ed243e94bbc25b4934b03a66' 2017-01-13 15:54:49 -08:00
ca74bb17b8 Merge pull request #675 from pavanky/more-atomic-fix
Ensure atomicAdd(double) is visible to host side code
2017-01-13 17:21:39 -05:00
69d8331195 Use functools.partial 2017-01-13 23:10:45 +01:00
eab5c1975c Avoid strict aliasing warning in float/half conversions. 2017-01-13 14:08:25 -08:00
e67b525388 Merge pull request #911 from gchanan/convWarning
Avoid strict aliasing warning in float/half conversions.
2017-01-13 17:06:17 -05:00
5171e56b82 Ensure atomicAdd(double) is visible to host side code
Just replicating behavior of the cuda headers
2017-01-13 14:05:36 -08:00
f467848448 Avoid strict aliasing warning in float/half conversions.
Verified that at least for GCC 4.47 this generates identical code.
2017-01-13 13:58:03 -08:00
7e4ddcfe8a Remove names from register_hook calls (#446)
The register hook calls now return an object that can be used to remove
the hook. For example,

   >>> h = module.register_forward_hook(callback)
   >>> h.remove()  # removes hook

Or as a context manager:

   >>> with module.register_forward_hook(callback):
   ...     pass

This makes it easier for libraries to use hooks without worrying about
name collisions.
2017-01-13 15:57:03 -05:00
3152be5fb3 Add repr to RNNs and Embedding (#428) 2017-01-13 15:53:52 -05:00
b076944dc5 Fix for atomicAdd(double) for CUDA_VERSION < 8000 2017-01-13 12:43:15 -08:00
3a07228509 Add ConvTranspose1d module (#449) 2017-01-13 15:22:57 -05:00
24a2f2e3a0 Add MaxUnpool1d module (#447) 2017-01-13 14:36:25 -05:00
b32dd4a876 add cudnn deb package installation paths to cudnn discovery, add 5.1.10 to load options (#448) 2017-01-13 14:32:23 -05:00
4f4bd81228 Fixes to autograd: (#442)
- Non differentiable outputs could prevent a gradient computation (see
   test_dep_nograd)
 - Crash in backward on variable which doesn't requires_grad (issue
   #438)
 - Stochastic functions could be backproped through multiple times
2017-01-13 13:51:47 -05:00
59b23d79c6 fix cudnn rnn batch_first with tests (#445)
* fix cudnn rnn batch_first with tests
2017-01-13 13:40:27 -05:00
8c14630e35 Fix Tensor.apply_() (#444)
Fixes #411
2017-01-12 21:51:18 -08:00
cc32de8ef9 Fix typos etc. in docs
- replace "long" with the Python type "int"
 - remove "reshape" from torch.rst since torch.reshape is not
   implemented
2017-01-12 21:25:50 -08:00
44696c1375 Fix MaxPool2d on 3D CUDA inputs (#443)
Currently, MaxPool2d returns 4d indices for 3d CUDA inputs, but
correctly returns 3d indices for 3d CPU inputs.
2017-01-12 21:04:25 -08:00
82088a8110 parallelizing catArray to multiple tensors per kernel (#635) 2017-01-12 12:57:30 -08:00
d5e45b2278 Add AvgPool1d which just uses AvgPool2d implementation (#439) 2017-01-12 15:07:11 -05:00
bdfef2975c adding more docs for torch.* functions 2017-01-11 08:19:49 -08:00
b4bb4b64a1 simd.h: really fix the arm64 (i.e. Aarch64) build 2017-01-11 10:07:32 +00:00
3e91c5e1ad Merge pull request #668 from gchanan/thrustalloc
Add THCThrustAllocator.cuh to install files
2017-01-10 19:27:09 -05:00
2b88d85505 Re-route thrust memory allocation to THCudaMalloc / THCudaFree in cunn. 2017-01-10 10:42:29 -08:00
50651970b8 Merge pull request #666 from gchanan/thrustalloc
Re-route thrust memory allocation to THCudaMalloc / THCudaFree
2017-01-10 12:02:51 -05:00
4a8906dd8a Add THCThrustAllocator.cuh to install files so downstream projects can use it. 2017-01-10 09:02:28 -08:00
68e2769a13 Re-route thrust memory allocation to THCudaMalloc / THCudaFree
so it can use the caching allocator.
2017-01-10 08:35:41 -08:00
17c998e99a fixing arm64 build 2017-01-10 00:15:11 -05:00
35758f51f2 Get rid of a few unused imports. 2017-01-09 15:41:58 -08:00
e8102b0a9b fix compiler warning in THCS 2017-01-09 15:19:13 -08:00
04f2bc9aa7 Fix bug in squeeze backward (#425) 2017-01-09 16:29:37 -05:00
d070178dd3 Instantiate 128kb of scratch space in GPU memory per-device by default 2017-01-09 13:21:18 -08:00
c9ec7fad52 Add model_zoo utility to torch.utils (#424)
This was originally part of a torchvision PR, but I think it will be
useful outside vision, such as for distributing word embeddings.
2017-01-09 13:16:58 -05:00
f0a6ca4d53 BatchNorm fixes (#423)
- don't use cuDNN for half inputs because weight, bias, running_mean,
   etc. are required to be of different type than for THCUNN
 - accept 3D inputs (N,C,L) in BatchNorm1d
 - remove accidental 'use_cudnn=False'
2017-01-09 13:16:51 -05:00
fd92470e23 Add cuDNN bindings for BatchNorm (#421) 2017-01-07 15:35:24 -05:00
8369664445 Minor doc fixes 2017-01-06 21:51:35 +01:00
35e1adfe82 documentation parity with torch7 for catArray impl 2017-01-06 11:55:57 -08:00
eb91fc5e5d Minor fixes to docs (#412) 2017-01-06 10:59:24 -05:00
d186fdb34c Fix THHalf issues with MSVC. 2017-01-05 08:09:09 -08:00
0f04f71b7e fix API reference link 2017-01-05 02:46:19 -05:00
87f1959be7 adding proper categories to torch.rst 2017-01-04 23:20:57 -05:00
a538055e81 fix invalid use of THPUtils_invalidArguments in sparse tensors 2017-01-04 21:47:48 -05:00
0e345aaf6d Fix invalidArguments to take kwargs and out into account (#397) 2017-01-04 19:49:11 -05:00
c976dd339d remove .zero() on grad_input conv and batch_norm 2017-01-05 01:48:50 +01:00
71cef62436 Fix condition for threadArgErrorHandler
Some error handlers may not have any data associated with them
2017-01-04 16:43:31 -08:00
3a29055044 Fix rnn sphinx docs (#405) 2017-01-04 19:17:10 -05:00
59d66e6963 Sparse Library (#333) 2017-01-05 00:43:41 +01:00
46bc43a80f fixing loss layer docs 2017-01-04 18:40:51 -05:00
7fa60b2e44 fixing docs of activations, pixelshuffle, sparse for rst 2017-01-04 18:40:51 -05:00
c78893f912 removing Image: references in nn activation docs 2017-01-04 13:51:56 -05:00
0d2a4e1a9e fix dropout docs for rst 2017-01-04 13:49:43 -05:00
088f14c697 fix batchnorm and linear docs for rst 2017-01-04 13:35:55 -05:00
4bf7be7bd5 fix RNN module docs for rst 2017-01-04 13:22:02 -05:00
b2ab6891c5 fix the rest of Pool module docs for rst 2017-01-04 12:51:55 -05:00
39ab5bcba8 fix MaxPool1d,2d,3d docs for rst 2017-01-04 03:11:48 -05:00
42f131c09f fixing nn.Conv* documentation for rst and adding nn docs to sphinx 2017-01-04 02:11:27 -05:00
89dca6ffdc Add a patch to stop Sphinx from cross-referencing ivar tags 2017-01-03 18:31:08 -05:00
b7f36f93d5 Expand autograd docs and add sections 2017-01-03 18:31:08 -05:00
58320d5082 Add multiprocessing docs 2017-01-03 18:31:08 -05:00
a461804a65 adding docs for more torch.* functions 2017-01-03 18:29:50 -05:00
817f6cc59d adding linspace, logspace, neg and range 2017-01-03 18:29:50 -05:00
108936169c implement more torch.* docs, remove zero, cauchy, log_normal from torch.* docs as they are not stateless 2017-01-03 18:29:50 -05:00
f60ae085e6 Float -> float, Long -> long 2017-01-03 18:29:50 -05:00
85dda09f95 fixed names and other cosmetics 2017-01-03 18:29:50 -05:00
4f479a98d4 fix indentation issue for all examples, add doc for add 2017-01-03 18:29:50 -05:00
35ba948dde add doc for *mm* functions, *mv* functions and addcmul, addcdiv 2017-01-03 18:29:50 -05:00
6b4ed52f10 adding docs for some torch.* functions, removing all, any stateless methods 2017-01-03 18:29:50 -05:00
dcf5f8671c Add __pow__ to Tensor and list additional undocumented functions (#398) 2017-01-03 13:38:44 -05:00
5340291add Update FindARM.cmake
Fix typos
2017-01-03 12:29:06 -05:00
1c6fe58574 Add gather and scatter to autograd 2017-01-02 13:42:59 -05:00
9f2111af73 Rename Variable.no_grad to Variable.detach 2017-01-02 13:42:59 -05:00
2ed6c6d479 Fix leaf Variable handling in autograd 2017-01-02 13:42:59 -05:00
01ac2d3791 Merge commit '1b97f088cb9e42717122795463a800bf3f503adf' 2017-01-02 09:39:45 -08:00
eac687df5a Merge commit '849cbf3a4774727eadb97c27af13bfbdc976a02a' 2017-01-02 09:39:20 -08:00
6a2785aef7 remove link_prefix from linker arguments (#395) 2017-01-02 12:37:52 -05:00
849cbf3a47 small cmake fix 2017-01-01 19:02:33 -05:00
a0c614ece3 unsqueeze instead of view in dataloader 2017-01-01 23:38:54 +01:00
1b97f088cb Merge pull request #651 from pavanky/cat
Adding support for empty tensors in cat, catArray
2017-01-01 12:47:19 -05:00
097399cdeb Merge branch 'master' into contiguous-cat-1d 2017-01-01 12:34:46 -05:00
7ee152881e Merge commit '3074f8eb8103ecdcbbcbb8d49332d9e7d6f3141c' 2017-01-01 01:13:17 -08:00
3074f8eb81 Removing TH_GENERIC_USE_HALF, TH_NATIVE_HALF, TH_GENERIC_NO_MATH (replaced where appropriate with TH_REAL_IS_HALF), removed half from THGenerateAllTypes, added an explicit THGenerateHalfType.h 2017-01-01 00:57:51 -08:00
748208775f Merge commit '5df17050bf82337d13dbd2108bd17922ac38956c' 2017-01-01 00:08:55 -08:00
5df17050bf Revert "TH_GENERIC_USE_HALF=1 by default, half enabled by default" 2017-01-01 01:06:18 -05:00
92df0eb2bf removing unneeded flags in build_all.sh 2016-12-31 20:16:50 -08:00
995195935b Merge commit 'be8376eb883d2f5a466994e024cde44e6adc6130' 2016-12-31 20:10:11 -08:00
be8376eb88 TH_GENERIC_USE_HALF=1 by default, half enabled by default 2016-12-31 20:07:18 -08:00
b650a45b9c fix botched merge in setup.py 2016-12-31 16:55:53 -05:00
8a20e22239 Add torch.stack 2016-12-31 16:25:39 -05:00
7c5014d803 Add torch.split, torch.chunk and change default dim of cat to 0 2016-12-31 16:25:39 -05:00
62ac1b4bdd Implement missing cases of __matmul__ 2016-12-31 16:25:39 -05:00
0633c08ec9 Add is_shared() method for storages and tensors 2016-12-31 16:25:39 -05:00
cf87cc9214 Check valid configurations of Variable flags 2016-12-31 16:25:39 -05:00
f908432eb3 Ensure that Variable's grad is shared between processes 2016-12-31 16:25:39 -05:00
1bd291c57c Fix multiprocessing tests on macOS 2016-12-31 16:25:39 -05:00
b277df6705 Doc css fixes for mobile and large screens (#389) 2016-12-31 12:01:01 -05:00
ec4d597c59 test fix 2016-12-31 11:08:34 -05:00
d2ef49384e Add custom docs stylesheet (#387) 2016-12-31 10:32:00 -05:00
b5dc36f278 explicitly linking against v1 libs to avoid lua-torch conflicts (#386) 2016-12-31 10:30:36 -05:00
41976e2b60 Merge commit '3dac1b9936a62225cf8516d6d7830fe6c83039ae' 2016-12-30 21:07:13 -08:00
3dac1b9936 cmake C flags fix 2016-12-31 00:06:26 -05:00
d2bb56647f Merge commit '224422eed6813c15b3c3b2c0dcd5e0187ec660a1' 2016-12-30 19:51:01 -08:00
224422eed6 cmake fix 2016-12-30 22:50:06 -05:00
3c26f7a205 Merge commit '10f78985e72fb6834b435ac3f8d0890fa6614365' 2016-12-30 19:24:00 -08:00
9ac9809f27 Merge commit 'd8f4d5f91e3680478a6843d49d7295c1165618f0' 2016-12-30 19:23:41 -08:00
7bf6e984ef Merge commit 'dc95f66a954ad18b80f3f649f8e2c8507c048b74' 2016-12-30 19:23:17 -08:00
10f78985e7 adding TH_LIBRARIES and THC_LIBRARIES var to THCUNN cmake 2016-12-30 22:20:29 -05:00
dc95f66a95 adding TH_LIBRARIES var to THC cmake 2016-12-30 22:10:18 -05:00
d8f4d5f91e adding TH_LIBRARIES var to THNN cmake 2016-12-30 22:08:09 -05:00
47f56f0230 Merge commit '43fbdd3b45d4351623a4aa9c8d5e6dba9eac259a' 2016-12-30 17:46:04 -08:00
b4018c4c30 Merge commit '803d0320771365754658ac74587cc082c2a61fa7' 2016-12-30 17:45:45 -08:00
43fbdd3b45 workaround for luarocks 12.04 bug 2016-12-30 20:44:35 -05:00
803d032077 workaround for luarocks 12.04 bug 2016-12-30 20:44:21 -05:00
9d2d884313 Merge commit 'b5cf1d2fc71604f472a07d0181a05a7f09e276c2' 2016-12-30 16:50:25 -08:00
c0600e655a Merge commit 'c1ca9044bd6dccd293471c6caeeeea4ebd97d61b' 2016-12-30 16:49:56 -08:00
671ed89f2a Merge commit '52c2a92013c45afa5df61a68b16695663ee9fab5' 2016-12-30 16:49:29 -08:00
e0372643e1 Merge commit '541ab961d8f9a02bbbe1a06ba25027116ee93c20' 2016-12-30 16:49:05 -08:00
b5cf1d2fc7 adding THCUNN_SO_VERSION 2016-12-30 19:06:23 -05:00
c1ca9044bd add THNN_SO_VERSION 2016-12-30 19:04:31 -05:00
52c2a92013 adding THC_SO_VERSION property 2016-12-30 19:02:50 -05:00
541ab961d8 adding TH_SO_VERSION option 2016-12-30 18:56:59 -05:00
849794cd2c Remove deprecated and unimplemented functions (#383) 2016-12-30 18:37:44 -05:00
f47fa2cb04 use __get_cpuid when available 2016-12-30 18:10:57 -05:00
7a162dd97a Fix outputs of torch.* comparison functions 2016-12-30 23:02:57 +01:00
b123bace1b Rename torch.autograd.functions to torch.autograd._functions 2016-12-30 23:02:57 +01:00
483490cc25 Move PixelShuffle implementation to functional 2016-12-30 23:02:57 +01:00
8d60e39fdc Rename torch.nn.functions to torch.nn._functions 2016-12-30 23:02:57 +01:00
e7dff91cf3 Fix for multinomial autograd function 2016-12-30 23:02:57 +01:00
ab5776449c Add documentation for some torch.xxx functions (#382) 2016-12-30 17:01:47 -05:00
a229582238 Merge pull request #875 from atkayu/add_histc2
Add a new function bhistc to calculate histogram of batch of images only once
2016-12-30 13:41:42 -05:00
a0df8fde62 Merge pull request #592 from joker512/master
fix: cunn can't find cutorch sources
2016-12-30 11:31:57 -05:00
e4a3aa9295 Change container doc to assign child modules via attributes 2016-12-30 15:51:09 +01:00
be98c5d12d Start documenting torch.Tensor (#377) 2016-12-30 01:21:34 -05:00
bc6a71b1f5 Add Function docs 2016-12-30 00:15:06 -05:00
26f1e2ca9c Add basic autograd docs 2016-12-30 00:15:06 -05:00
75d850cfd2 Fix optim docs 2016-12-30 00:15:06 -05:00
f4870ca5c6 Fix nn docs 2016-12-30 00:15:06 -05:00
235d5400e1 Merge pixelshuffle function into module (#375) 2016-12-29 21:38:37 -05:00
491d5ba4fd add new flags to build_all.sh 2016-12-29 18:16:59 -08:00
d42eadfeb9 Merge commit '2975f539ff8ac9b8e07fb2b610bd69a1596d4c3c' 2016-12-29 17:51:34 -08:00
9a40821069 Merge commit '1ac038ab243bb2718b37cbd81eadbfeb2a234252' 2016-12-29 17:51:13 -08:00
2975f539ff sort cuda 8.0+ fix 2016-12-29 17:47:30 -08:00
64ca584199 Fix group support in convolution modules (#374) 2016-12-29 20:01:39 -05:00
5263469e21 Fix handling of zero sizes in caching host allocator 2016-12-29 15:36:49 -08:00
c367e0b64e Support dilated 1d and 3d convolutions (#372)
Fixes #367
2016-12-29 18:20:32 -05:00
183b3aacd2 Hold CuDNN PRNG state between RNN iterations 2016-12-30 00:14:55 +01:00
101950ce92 fix repr in legacy.nn.linear 2016-12-29 17:30:46 -05:00
239ae94389 fix in conv repr 2016-12-29 17:30:46 -05:00
55e850d825 test if modules can be printed with fixes 2016-12-29 17:30:46 -05:00
62af45d99f Basic functional interface (#354) 2016-12-29 22:53:57 +01:00
1ac038ab24 Merge pull request #882 from amrobbins/ppcvectorinstxns
Add support for VSX vector instructions on PPC
2016-12-29 14:24:56 -05:00
77a925ab66 Add THHalfTensor support to cutorch (#655)
* Add THHalfTensor support to cutorch.
2016-12-29 14:23:45 -05:00
d0d33d3ae7 Add support for torch.HalfTensor (#874)
* Add support for torch.HalfTensor.

* Improvements/Simplifications for torch.HalfTensor.

Improvements/Simplifications:
1) Defines half type as TH_Half, so as to not conflict with cutorch
version.  Previously, these were defined as the same "half" type and
required proper ordering of includes to ensure type was only defined
once, which would have affected all downstream projects.
2) No longer generates math functions that are not actually defined
on torch.HalfTensor, e.g. maskedFill, map, etc.
3) Adds tests for all available torch.HalfTensor functions
4) Allows compiling without TH_GENERIC_USE_HALF (so if there's a
problem can just unset that in CMakeLists rather than backing out)
5) Some simplifications: removes a new copy optimization and
some TH_HALF literal definitions

Limitations:
Because math functions are not defined, some "non-math" operators
on torch.HalfTensor give an error message, e.g. __index__/__newindex__
with a ByteTensor apply a mask, but masks aren't implemented.  These
limitations aren't always obvious, (e.g. for documentation purposes),
but they should always give an error message.

* Rename TH_HALF to THHalf.
2016-12-29 14:23:26 -05:00
9b7eceddc8 Accept outputs in out argument 2016-12-29 12:25:59 +01:00
24af02154c Use ForkingPickler for sharing tensor/storages across processes (#344)
This hooks into the (internal) ForkingPickler class in multiprocessing
to reduce tensors, storages, and CUDA events instead of our queue from
joblib. This makes it easier to use the standard multiprocessing classes
in later versions of Python.

This also exposes:

 - Tensor/Storage.share_memory_()
 - Module.share_memory()

These methods move the CPU tensors and storages to shared memory. If
you're using the "fork" method of multiprocessing, these objects can be
directly inherited instead of serialized through a queue.
2016-12-28 20:34:23 -05:00
86ec14e594 Add support for VSX vector instructions on PPC
Added support for the fill, diff, scale, mul and add functions using
PPC CPU vector instructions. These are used in place of the versions
of these functions written for x86, when compiled on PPC.

This fixes a compile failure on PPC
2016-12-28 16:58:09 -06:00
8a29338837 Use cuDNN for Conv3d and ConvTranspose3d (#359)
I've also updated test_nn.py to run marked tests twice: once with cuDNN
enabled and once with it disabled.
2016-12-28 16:14:47 -05:00
29918c6ca5 Copy libnccl.so.1 instead of libnccl.so
Occasionally, my PyTorch checkout gets into a bad state where libnccl.so
does not exist, but the NCCL makefile doesn't build it because
libnccl.so.1 exists. Switch to copying libnccl.so.1 to work around this.
2016-12-28 20:21:31 +01:00
80a44e84dc Change multinomial return type for CUDA 2016-12-28 18:15:17 +01:00
5497b1babb Use TypeError in invalidArguments 2016-12-28 18:15:17 +01:00
bef70aa377 Make type checking more strict and fix topk arguments 2016-12-28 18:15:17 +01:00
0d30f77889 Make variables picklable with protocols <2 2016-12-28 18:15:17 +01:00
e27bb3e993 Minor fixes 2016-12-28 18:15:17 +01:00
179d5efc81 Merge commit '310ec57fd7176e07137ab7bc717f3602b6f53aa5' 2016-12-28 07:33:37 -08:00
b55e38801d rename histc2 to bhistc 2016-12-28 16:26:09 +08:00
e704ec5c6f Merge commit '46f024846698cd8201d6c1804f21bffda15a2069' 2016-12-27 19:12:45 -08:00
6cda6bb34c Merge commit 'd2a93c310292c9427056e02ac7e0d5cca12a04a2' 2016-12-27 19:12:21 -08:00
46f0248466 Use bool for sizeAverage in SoftMarginCriterion 2016-12-28 00:36:11 +01:00
310ec57fd7 Fix typos in THCTensorRandom 2016-12-28 00:16:53 +01:00
cd82b2b869 Implement comparison and logical operators for tensors 2016-12-28 00:04:08 +01:00
126a1cc398 Add Sphinx docs 2016-12-28 00:03:39 +01:00
bf650f05b3 Merge pull request #652 from apaszke/multinomial
Make multinomial return a LongTensor (compatible with CPU version)
2016-12-27 17:54:54 -05:00
f2606a7502 Make multinomial return a LongTensor (compatible with CPU version) 2016-12-27 23:12:12 +01:00
b07fe52ee0 Adding support for empty tensors in cat, catArray 2016-12-27 13:37:42 -08:00
b07358b329 renaming test to avoid dot in test name 2016-12-27 13:34:09 -08:00
2aea8077f9 renaming test to avoid dot in test name 2016-12-27 13:17:04 -08:00
41f9c14297 Merge commit '135687f04a4e4e0722c14f096c9a1fc647c95f07' 2016-12-27 13:12:26 -08:00
135687f04a critical bugfix in storage copy 2016-12-27 13:11:32 -08:00
b140e70b58 Add autograd.backward (#341) 2016-12-26 19:10:35 -05:00
ec987b57f6 removing 3.3, 3.4 from README badges 2016-12-26 14:52:36 -05:00
596677232c Add a different code path for catting contiguous tensors along the first dimension, for speed reasons.
Fix a bug in cat when catting with an empty tensor along first dim (it added an extra dim).
Fix the ambiguous 'catting along last dimension' sentence in the doc and change the behavior to pick the maximum last dimension over all input tensors.
Now empty tensors are allowed.
2016-12-26 10:23:42 -05:00
9d74e139e5 removing 3.3 and 3.4 from travis build 2016-12-25 15:13:13 -05:00
d2a93c3102 remove unused buffer in avg pooling 2016-12-25 20:00:10 +01:00
bc475cad67 Move max pooling construction logic to functions (#343) 2016-12-25 10:28:11 -05:00
45d6212fd2 default args for conv functions 2016-12-25 01:55:00 -05:00
f45d75ed22 make the CUDA-aware tests back off if CUDA is not available 2016-12-24 15:36:00 -05:00
b03407289f Merge commit '55a794e6ec8d01fc8cceee14ce23ec501e517570' 2016-12-24 11:06:27 -08:00
55a794e6ec fixing OpenMP longjmp bugs in *MaxUnpooling 2016-12-24 13:54:43 -05:00
93ed476e7d adding LAPACK double bindings, adding fmod and remainder 2016-12-22 17:36:47 -08:00
10faa303bc Merge commit '6fa371cb0db9f43e3d05746c7e90516975052589' 2016-12-22 17:35:13 -08:00
6fa371cb0d bugfix for qr skinny matrices 2016-12-22 16:29:53 -08:00
18a2691b4b Fix memory leak in THStorage_copyCudaXXX 2016-12-22 13:49:31 -08:00
f7bd3f7932 added pixel shuffle layer + tests
removed duplicate save_for_backward
2016-12-22 21:43:38 +01:00
f8dee4620a add a new function histc2 2016-12-22 10:11:58 +08:00
800e24616a Merge commit 'fa61159dd0bfd9bbb190e1dfbd90a68f4d3c30c8' 2016-12-21 12:40:41 -08:00
d63a435787 Merge commit 'f16a624b35dd28fbd4cdcd3bd08dfc2421c3e2b0' 2016-12-21 12:40:20 -08:00
a9c2809ce3 change the order of cudnn libs 2016-12-21 05:44:16 -08:00
fa61159dd0 cremainder, cfmod implementations (take 2) (#646) 2016-12-20 20:43:07 -05:00
a215e000e9 fix for out of place tests and for non standard I/O pipes 2016-12-20 16:13:24 -08:00
f16a624b35 correctness fixes for mod and remainder for integer type tensors. 2016-12-20 11:41:16 -08:00
61c2896cb8 Merge pull request #638 from pavanky/multinomial_fix
Bugfix for multinomial distribution
2016-12-20 14:08:59 -05:00
22ebc3f205 Revert "Add support for cremainder, cfmod" 2016-12-20 09:35:41 -05:00
8fa9f443ec Merge pull request #641 from killeent/cfuncs
Add support for cremainder, cfmod
2016-12-19 20:49:29 -05:00
bb72ccf1a5 Support CUDA IPC in Python 3 (#203)
CUDA IPC only works with Python 3 using the "spawn" start method. You
can select the start method using the get_context method:

 import torch.multiprocessing as mp
 ctx = mp.get_context('spawn')
 queue = ctx.Queue()
 event = ctx.Event()
2016-12-19 20:42:53 -05:00
2e73456f5c Fix compiler warnings in Tensor.cpp 2016-12-19 20:35:08 -05:00
3e49a2b4b7 Prevent deepcopy from changing Parameters into Variables 2016-12-19 20:35:08 -05:00
4694e4050b Fix printing bug when all values are NaN or inf 2016-12-19 20:35:08 -05:00
59b9eeff49 Expose gather and equals for CUDA tensors 2016-12-19 20:35:08 -05:00
1744fad8c2 Use 'void' for no-arg function 2016-12-19 12:23:17 -08:00
e46d942ca6 Fix double initialization of HalfStorage (#331) 2016-12-19 15:19:41 -05:00
93a6136863 Add support for cremainder, cfmod 2016-12-19 11:25:10 -08:00
230bde94e7 fix about section 2016-12-19 11:00:53 -05:00
20fffc8bb7 Fix torch.is_tensor for half tensors (#322)
Fixes #311
2016-12-19 15:27:47 +01:00
861a3f3a30 avoid shadowing warnings 2016-12-17 14:01:11 -08:00
ee52102943 small change from set to dict 2016-12-17 13:39:04 -08:00
26516f667e Fix multinomial bug and decrease precision of normal test (#325) 2016-12-17 21:40:13 +01:00
5586f48ad5 add cudnn 5.0.5 to supported versions (#321) 2016-12-17 07:57:20 -05:00
cc6e3c92d2 ensure that legacy linear has gradWeight and gradBias fields (#319) 2016-12-17 00:06:58 +01:00
a2ef5782d0 Revert "Bugfix of type in THCTensor macro." 2016-12-16 17:20:57 -05:00
0c1c0e21b8 Bugfix of type in THCTensor macro.
A fix for issue #632.
2016-12-16 15:37:06 -05:00
ffcc38cf05 Deterministic ordering of parameters and buffers. (#317)
Uses the assignment syntax to get deterministic ordering of parameters.
The ordering of parameters using the constructor syntax is
non-deterministic because kwargs use dict() in Python 3.5 and earlier.
2016-12-16 14:45:56 -05:00
cc24b68584 Merge commit 'f413ee087df1a4bbd8b5a9baba83d07ae0729ea0' 2016-12-16 05:29:16 -08:00
8a70067b92 Add support for stochastic functions in autograd (#294) 2016-12-16 13:14:37 +01:00
33b227c45b serialization bug fix (#314) 2016-12-16 12:05:36 +01:00
fb68be952d Bugfix for multinomial distribution
- Ensures the index of the first bin from the cdf is returned.
2016-12-15 16:01:37 -08:00
f413ee087d Add missing free in LookupTable (#400) 2016-12-15 22:17:37 +01:00
6495f5dd30 fix bounds issue in snprintf 2016-12-14 17:11:26 -08:00
8e09f0590b Make sure that C extension was compiled with cuDNN before using it 2016-12-15 00:47:55 +01:00
08d346df9c Print libraries used for building the extension 2016-12-15 00:47:55 +01:00
12cf96e358 Don't change requires_grad of parameters in train() and eval() 2016-12-15 00:47:55 +01:00
765a720d1c Add support for tds.Vec and tds.Hash in load_lua 2016-12-15 00:47:55 +01:00
cace62f94c Fix a bug in narrow docs 2016-12-15 00:47:55 +01:00
767c96850d Return False from torch.cuda.is_available() when no devices are visible 2016-12-15 00:47:55 +01:00
b73e78edbb Check nDimension in t() and t_() 2016-12-15 00:47:55 +01:00
7914cc119d Fix bmm for Variables 2016-12-15 00:47:55 +01:00
2b13eb2a6c Fix naming of setup.py env toggles 2016-12-15 00:47:55 +01:00
8768e64e97 Allow returning changed gradients from the hooks 2016-12-15 00:47:55 +01:00
9212b9ca09 fix wrong export directive for THCCachingHostAllocator (#633)
fix wrong export directive for THCCachingHostAllocator
2016-12-15 00:36:03 +01:00
0d0f197682 Add note on Huber loss (#310) 2016-12-14 21:39:42 +01:00
281e34d1b7 fixes for changes in THNN API 2016-12-13 18:10:07 -08:00
287ba38905 Merge commit 'ed9dbff4e0295dbeb2e8de908cb8c1109c278a8a' 2016-12-13 17:23:56 -08:00
ed9dbff4e0 removing ifdef 2016-12-13 17:22:52 -08:00
6ba4e48521 Merge commit '3adcb2c157ed7df5aaff9b59d4526aa24ec770db' 2016-12-13 16:49:38 -08:00
b7269f2295 Merge commit '220183ed783101f19d88cb8fb3052fd4abc7234f' 2016-12-13 16:49:15 -08:00
5ab317d4a6 Merge commit '258c9ffb2c2d23a06b153aa9161a88ad930cfbbc' 2016-12-13 16:48:45 -08:00
431bcf7afa Merge commit '56245426ebcf239363867905ca2a4cea676dd45d' 2016-12-13 16:48:16 -08:00
41909e8c5b adding a couple more imports 2016-12-13 16:47:00 -08:00
56245426eb small fixes to allocator 2016-12-13 16:45:01 -08:00
3adcb2c157 Check that batch size matches the target size in ClassNLLCriterion (#399) 2016-12-14 00:25:05 +01:00
6d12185cc9 Fixed compilation on Raspberry PI without NEON 2016-12-13 17:30:54 -05:00
258c9ffb2c Implement bernoulli with element-wise probabilities for all types 2016-12-13 11:10:28 -08:00
dede431dd9 More state_dict fixes (#305)
In #304 I forgot even more...

I did a repo search and this time it should be all.
2016-12-13 13:59:06 +01:00
6312d29d80 Another documentation change, missed one in #303 (#304)
Apparently load_parameter_dict was also renamed to load_state_dict
2016-12-13 12:47:40 +01:00
ab5f26545b Correct documentation to be in line with #237 (#303)
.parameter_dict was renamed to .state_dict in #237

This documentation change reflects that.
2016-12-13 12:32:42 +01:00
6567c1342d small doc fixes 2016-12-12 23:51:54 +01:00
3d6c2e023c TensorInfo related code documentation 2016-12-12 10:06:13 -08:00
89d930335b fix tests for GPU-less setup (#298) 2016-12-12 10:56:57 +01:00
04393cd47d fix gcc-6 build on os x (#297) 2016-12-12 00:01:15 +01:00
28f0cf6cee Add docstring support to cwrap (#295) 2016-12-11 23:25:14 +01:00
1af9a9637f Refactor copy and release GIL during copy (#286) 2016-12-11 21:54:58 +01:00
1031d671fb legacy fixes (#287) 2016-12-11 20:13:48 +01:00
2a974f5ca2 Fix 1.3.2 compilation 2016-12-08 09:11:43 -08:00
ee91b22317 Merge pull request #394 from gchanan/volumShapeChecks
Improve Volumetric shape checking.
2016-12-07 02:07:22 +01:00
220183ed78 Improve gradOutput checks for VolumetricReplicationPadding. 2016-12-06 09:09:38 -08:00
504d2ca171 Improve gradOutput check for VolumetricMaxUnpooling. 2016-12-06 09:09:27 -08:00
d535aa94a1 Improve shape checks for VolumetricDilatedConvolution, VolumetricConvolutionMM,
VolumetricFullConvolution.

Also add some additional checks for SpatialFullConvolution.
2016-12-06 09:06:07 -08:00
0376a1909b Improve shape checks for VolumetricAveragePooling, VolumetricDilatedMaxPooling,
VolumetricMaxUnpooling, VolumetricReplicationPadding.
2016-12-06 09:06:03 -08:00
f757077780 Improve shape checks for VolumetricMaxPooling and VolumetricDilatedMaxPooling. 2016-12-06 09:05:59 -08:00
648e9fbb58 Adding missing file 2016-12-05 18:06:24 -08:00
9f7114a4a1 Improve shape checks for VolumetricDilatedConvolution, VolumetricConvolution,
VolumetricFullConvolution.

Also add some additional checks for SpatialFullConvolution.
2016-12-05 12:22:04 -08:00
7d03da0890 Improve shape checks for VolumetricAveragePooling,
VolumetricMaxUnpooling, VolumetricReplicationPadding.
2016-12-05 09:31:00 -08:00
4e0cecae7f Improve shape checks for VolumetricMaxPooling and VolumetricDilatedMaxPooling. 2016-12-05 08:20:19 -08:00
72dbb76a15 fix half type numerics issue in SpatialFractionalMaxPooling 2016-12-02 16:33:27 -08:00
cceb926af3 Remove extra size check in SpatialAveragePooling. 2016-12-02 15:36:11 -08:00
0d7d29fa57 Enable caching allocator for CUDA pinned memory (#275)
Also add binding for CUDA "sleep" kernel
2016-12-02 01:33:56 -05:00
be3276fcdd Account for batch_size in DataLoader.__len__() (#277) 2016-12-02 01:21:36 -05:00
09c94a170c Merge commit 'f2a18004a77f146bb5b431715402f4afd3cacccd' 2016-12-01 22:16:58 -08:00
f2a18004a7 Process outstanding CUDA events in recordEvent
Without this, the cuda_events could continuously grow from calls to
cudaMemcpyAsync, but would never be processed if there were no new
pinned memory allocations.

For example:

 t1 = cutorch.createCudaHostTensor(10)
 t2 = torch.CudaTensor(10)
 while true do t2:copyAsync(t1) end
2016-12-01 19:09:47 -08:00
1a3ff1bd28 Remove unnecessary shape checks in Spatial Pooling modules.
Checks comparing input image sizes to kernel sizes are superseded
by output size checks.
2016-12-01 15:49:53 -08:00
a5d3c779c7 Add gradOutput shape checks in temporal modules. 2016-12-01 15:49:48 -08:00
9d32e60dc2 Fix spacing in SpatialDilatedMaxPooling. 2016-12-01 15:49:41 -08:00
f6913f56ea Remove unnecessary shape checks in Spatial Pooling modules.
Checks comparing input image sizes to kernel sizes are superseded
by output size checks.
2016-12-01 15:38:51 -08:00
801fe8408f Add gradOutput shape checks in Temporal modules. 2016-12-01 15:37:59 -08:00
cf4a979836 Improve shape checking for Temporal Convolution. 2016-12-01 15:37:49 -08:00
34d27771c6 1.3.2 release
Broadcast tuning
Better checking of inputs
Copy/reduce code simplification
2016-12-01 15:17:50 -08:00
1093821c33 Replace min BW by average BW in tests 2016-12-01 15:16:35 -08:00
91f2946310 Import most common packages by default 2016-12-01 23:14:41 +01:00
2bd7a3c31d Don't raise an error when retrieval of container's source code fails 2016-12-01 23:14:41 +01:00
a681f6759b Raise correct error types when indexing tensors 2016-12-01 23:14:41 +01:00
cb849524f3 Improve cuDNN detection at build time 2016-12-01 23:14:41 +01:00
1f5951693a Change torch.randperm to return Long tensors 2016-12-01 23:14:41 +01:00
87748ffd4c Add .type() for torch.nn modules 2016-12-01 23:14:41 +01:00
0580f5a928 Add __len__ for tensors 2016-12-01 23:14:41 +01:00
88d9fdec2e Add torch.cuda.set_device 2016-12-01 23:14:41 +01:00
506a40ce44 Remove optim submodule attributes from torch.optim package 2016-12-01 23:14:41 +01:00
bf0e185bd6 Merge commit 'bb1019d1ec1503718b97d17366902f96f349f472' 2016-12-01 13:47:20 -08:00
5b3ccec10d Merge commit 'c2d32030a25e352eb2e2af26931163c0f4c96b36' 2016-12-01 13:46:35 -08:00
eb07581502 Merge commit 'bec6ab47b6782f60925e306b69e0f556274fb28e' 2016-12-01 13:46:03 -08:00
934a2b6878 Merge commit 'b27d4de850b5f43829bd4980f5e7f3b4b32ab7cf' 2016-12-01 13:45:05 -08:00
bec6ab47b6 Add caching allocator for pinned (host) memory
Adds a caching allocator for CUDA pinned (page-locked) memory. This
avoids synchronization due to cudaFreeHost or cudaHostUnregister at the
expense of potentially higher host memory usage.

Correctness is preserved by recording CUDA events after each
cudaMemcpyAsync involving the pinned memory. The pinned memory
allocations are not reused until all events associated with it have
completed.
2016-12-01 13:35:12 -08:00
49480f1548 Adds a CUDA "sleep" kernel
Adds a CUDA "sleep" kernel which spins for the given number of
iterations. This is useful for testing correct synchronization with
streams.
2016-12-01 12:45:07 -08:00
18a3c62d9b Allow NoneType for parameters in Module.load_state_dict 2016-12-01 20:12:15 +01:00
6322cf3234 Allow device=None in Tensor constructor
Setting device=None is the same as not specifying the device (use the
current active device).
2016-12-01 20:09:19 +01:00
4e2b154342 update install command from source 2016-12-01 10:55:04 +01:00
bb1019d1ec Add newContiguous calls that have been removed from lua. 2016-11-30 13:58:22 -08:00
c2d32030a2 Move make contiguous code from lua to C.
Exceptions are:
1) SparseLinear
requires additional parameters to be passed in (e.g. nbatches),
so it's not clear it's worth moving to C since it won't really simplify the binding
code logic.

2) BatchNormalization
requires "makeBatch", which isn't a trivial translation to C.

3) LookupTable
requires "view" in C, which is already a TODO

4) SpatialUpSamplingBilinear
requires "view" in C, which is already TODO
2016-11-30 13:45:16 -08:00
162170fd7b Add optional weight decay to optim.SGD (#269) 2016-11-29 20:35:40 -05:00
ea728e7c5e Add DataParallel container (#268)
Adds a container version of the `data_parallel` function. This is a
drop-in replacement for the DataParallel class in the ImageNet example.
2016-11-29 16:36:01 -05:00
aea6ba4bcd Support pinned memory in the DataLoader (#265)
DataLoader now supports the constructor argument 'pin_memory'. When set
to true, tensors in the sample are copied to pinned memory. This happens
in a background thread when num_workers > 1.
2016-11-29 12:35:03 -05:00
ab357c14fc Merge pull request #1051 from gchanan/temporalShapeCheck
Improve error messages/shape checks for temporal modules.
2016-11-28 13:51:16 -06:00
606aa43da0 Merge pull request #383 from gchanan/TemporalShapeCheck
Improve error messages/shape check in TemporalMaxPooling.
2016-11-28 13:50:59 -06:00
8bfa802665 Improve error messages/shape check in TemporalMaxPooling. 2016-11-28 11:46:26 -08:00
ff5b73c0b3 Improve error messages/shape checks for temporal modules. 2016-11-28 11:19:00 -08:00
ddddfba1c0 Merge pull request #54 from peterhj/peterhj-staticlib
Add a static library target "staticlib" to the Makefile.
2016-11-28 09:15:39 -08:00
86c95014a4 use local modified select_compute_arch.cmake for msvc 2016-11-28 14:02:21 +08:00
288c950c5e use local modified select_compute_arch.cmake for msvc 2016-11-28 13:23:24 +08:00
b27d4de850 changes to compile with msvc 2016-11-28 10:27:36 +08:00
61063ebade Merge commit 'a7f24ccb7635447b133011d39e36279be140149e' 2016-11-26 09:13:12 -08:00
3e70e26278 Merge commit '08a1bc71c0712a4151de83d1487a55b218ae1a15' 2016-11-26 09:12:53 -08:00
66e7e42800 Merge commit '379860e457dbb72c0f18e0366e5b199452b302f5' 2016-11-26 09:12:24 -08:00
0fecec14b8 fixing bug in indexing when given float indices 2016-11-26 11:50:56 -05:00
a7f24ccb76 Fix shapeCheck in Spatial Pooling modules 2016-11-26 17:41:59 +01:00
08a1bc71c0 Fix shapeCheck in Spatial Pooling modules 2016-11-26 15:00:32 +01:00
04e896a4b4 adding coverage support for tests 2016-11-26 00:26:30 -05:00
5dcfb80b36 lua serializer registers CUDA classes only when CUDA is available 2016-11-26 00:26:30 -05:00
9da60c39ce Fix batch_first in AutogradRNN (#255) 2016-11-25 23:55:45 -05:00
379860e457 Lazily initialize CUDA devices
Previously, cutorch would initialize every CUDA device and enable P2P
access between all pairs. This slows down start-up, especially with 8
devices. Now, THCudaInit does not initialize any devices and P2P access
is enabled lazily. Setting the random number generator seed also does
not initialize the device until random numbers are actually used.
2016-11-25 15:22:16 -08:00
bcfa2d6c79 Add .t7 file reader 2016-11-25 00:41:55 +01:00
8b492bbc47 Return accreal as correct python types 2016-11-25 00:40:36 +01:00
a49b7b0f58 Fix bug when Variable constructor didn't set the error properly 2016-11-25 00:40:36 +01:00
c781ac414a Unify signatures of max, mean, etc. between variables and tensors 2016-11-25 00:40:36 +01:00
656dca6edb Implement in-place operators for variables 2016-11-25 00:40:36 +01:00
830adfd151 Allow passing torch.Size to expand 2016-11-25 00:40:36 +01:00
6f7c8e4ef8 Fix bug when passing 0 as dim to max, min, mode, median and kthvalue 2016-11-25 00:40:36 +01:00
5765d608cc Add a static library target "staticlib" to the Makefile.
Rename the static library "libnccl_static.a" to disambiguate from the
dynamic libraries.
2016-11-24 11:31:03 -08:00
2ba6678766 Revert "Lazily initialize CUDA devices" 2016-11-23 19:40:03 -05:00
71a47d1bed Merge pull request #610 from colesbury/lazy
Lazily initialize CUDA devices
2016-11-23 17:48:00 -05:00
51bf6321ea Implemented cudaMemGetInfo for caching allocator (#600)
* Implemented cudaMemGetInfo for caching allocator
2016-11-23 17:38:57 -05:00
aa8916e7c6 Don't unpack single element tuples returned by functions 2016-11-23 18:48:41 +01:00
2e24da2a0b Change parameter_dict to state_dict in torch.nn 2016-11-23 18:48:41 +01:00
c94ccafb61 Print error message when constructing a tensor from a numpy array with negative strides 2016-11-23 18:48:41 +01:00
80a827d3da Fix data_parallel bugs 2016-11-23 18:48:41 +01:00
6909c8da48 Use TH_INDEX_BASE for range asserts in MultiLabelMarginCriterion 2016-11-23 13:26:16 +01:00
c07105a796 fix cwrap for changed signatures 2016-11-22 14:27:41 -08:00
c40c061a9f Lazily initialize CUDA devices
Previously, cutorch would initialize every CUDA device and enable P2P
access between all pairs. This slows down start-up, especially with 8
devices. Now, THCudaInit does not initialize any devices and P2P access
is enabled lazily. Setting the random number generator seed also does
not initialize the device until random numbers are actually used.
2016-11-22 13:43:25 -08:00
a9bd27ce5c Merge commit '709255d9952783eed6c8f84e504693f9b436f852' 2016-11-22 13:26:09 -08:00
2e36c4ea2d Merge commit 'f3cb636294fbd0e15dd4b3bfdca16e73d1dca38b' 2016-11-22 13:25:53 -08:00
4e45385a8d Merge commit 'b27f576f29189ca78dd670cbd177bfa29b695c50' 2016-11-22 13:25:29 -08:00
cf5e925c10 Merge commit 'f6b94dd830c06692cd78addd41868a7a12c48755' 2016-11-22 13:25:00 -08:00
709255d995 added shape checks for SpatialAveragePooling 2016-11-22 13:23:16 -08:00
f3cb636294 refactoring and adding additional shape checks for SpatialAveragePooling 2016-11-22 13:08:58 -08:00
e3f440b1d0 Make torch.backends.cudnn work on OSX 2016-11-22 19:06:08 +01:00
f6b94dd830 Add some documentation for APPLY and DIM_APPLY macros 2016-11-21 14:02:33 -08:00
3911a1d395 Fix memory leak in LogSoftMax 2016-11-21 21:32:10 +01:00
ebd3648fd6 Call newContiguous rather than arg checking isContiguous. 2016-11-21 21:32:10 +01:00
f698f09cb7 Add contiguous checking / make tensors contiguous for
SpatialUpSamplingBilinear, PReLU, SpatialSubSampling, TemporalConvolution.
2016-11-21 21:32:10 +01:00
86aa5dae05 Move VolumetricConvolution contiguous code from lua to C. 2016-11-21 21:32:10 +01:00
179c82ffb4 Autograd functions no longer store references to saved_variables
Only references to their data and version counters are stored.
Also, it is now possible to have None arguments in save_for_backward
and return too many values from backward (as long as the excessive
results are None).
2016-11-21 19:39:55 +01:00
233017f01f Add torch.multinomial for CUDA 2016-11-21 19:39:55 +01:00
c2c515516b Remove irrelevant output from ncclReduce Fortran tests 2016-11-21 10:18:04 -08:00
9c18468fe2 Add Copyright header to Fortran bindings source files 2016-11-21 10:17:58 -08:00
597bbfeacd SpatialConvolutionLocal uses baddbmm 2016-11-21 09:10:26 -08:00
99a169c17e Fix memory leak in LogSoftMax 2016-11-19 23:44:31 +01:00
0613ac90cd string.split and string.join removed for .split and .join 2016-11-18 16:23:34 -08:00
78871d829a Call PyObject_GC_UnTrack from tp_dealloc handler (#231)
Without the PyObject_GC_UnTrack call, the tp_dealloc handler could get
called twice if a referred to object triggers a garbage collection from
its destructor.

See http://bugs.python.org/issue28737
2016-11-18 14:06:35 -05:00
d40a7bf9eb Fix Scatter.backward() (#232) 2016-11-18 13:58:09 -05:00
b27f576f29 guard random functions for half 2016-11-18 09:32:32 -08:00
073dfd8b88 bump version 2016-11-18 12:26:12 -05:00
509dd57c2e tensor docs 2016-11-18 04:00:27 -05:00
7a837b7a14 fixing nn docs to be categorized, and optim docs 2016-11-18 03:18:48 -05:00
dee864116a optim docs 2016-11-17 21:09:17 -05:00
5f2b32e45b Add Fortran bindings 2016-11-17 15:33:34 -08:00
e51d0bef97 Add cuDNN bindings for 2D transposed convolution 2016-11-17 14:34:40 -08:00
2fd78112ab Add half copy/conversions 2016-11-17 14:34:33 -08:00
5c14bd2888 Merge pull request #605 from gchanan/halfAddrAddmv
Add half support for addmv and addr.
2016-11-17 14:33:31 -08:00
84b4665e02 Add half support for addmv and addr. 2016-11-17 14:27:56 -08:00
26d626a47c adding docs for loss functions, container, module and fix typos 2016-11-17 15:11:27 -05:00
6ff6299c65 fix memory leak in (equal) 2016-11-16 15:43:57 -08:00
071e68d99d fixing output size w / h order 2016-11-16 15:32:18 -08:00
78c1094d93 Don't override __call__ in modules 2016-11-16 15:32:18 -08:00
56fc639c9f Fix no bias mode of autogenerated THNN function 2016-11-16 15:32:18 -08:00
51084a9054 Merge pull request #603 from killeent/remainder
Implement fmod, remainder, equal in Cutorch
2016-11-16 15:20:57 -08:00
f8ae5c93e9 enables random functions for float and half types on cuda (#223) 2016-11-16 15:14:26 -08:00
ad286c0692 add support for equal in cutorch 2016-11-16 14:41:59 -08:00
a483b3903d Merge pull request #377 from gchanan/checkContiguous
Add contiguous checks / auto contiguous
2016-11-16 10:35:11 -08:00
6564d39777 Call newContiguous for tensors that are required to be contiguous.
Also add tests to verify that non-contiguous tensors are handled correctly.
2016-11-16 09:50:11 -08:00
8f1b7230fe add support for fmod in cutorch 2016-11-16 08:35:17 -08:00
c0b7608965 add support for remainder in cutorch 2016-11-16 08:12:44 -08:00
56dd4132c4 add MACOSX_DEPLOYMENT_TARGET to instructions 2016-11-16 10:45:56 -05:00
91494cb496 Call newContiguous rather than arg checking isContiguous. 2016-11-15 16:16:08 -08:00
9057eade95 Handle contiguousness and improve shape checks
in SpatialAdaptiveMaxPooling, SpatialUpSamplingNearest, and TemporalConvolution.
2016-11-15 14:17:45 -08:00
a28317b263 SpatialSubSampling contiguous check. 2016-11-15 14:16:48 -08:00
25c3603266 VolumetricConvolution check contiguous. 2016-11-15 14:15:55 -08:00
ae6f2dd11c Adapt nn code to changes in THNN and THCUNN 2016-11-15 23:02:14 +01:00
3aaa1771d5 [cutorch mag2gen] more cleanup 2016-11-15 13:31:57 -08:00
2034396a3c [cutorch mag2gen] some cleanup 2016-11-15 13:31:57 -08:00
0cad668065 [cutorch mag2gen] move qr to generic 2016-11-15 13:31:57 -08:00
f644a11b82 [cutorch mag2gen] move potr* to generic 2016-11-15 13:31:32 -08:00
d7e3b2ef29 [cutorch mag2gen] move inverse to generic 2016-11-15 13:31:32 -08:00
fc5ec87478 [cutorch mag2gen] move svd to generic 2016-11-15 13:31:32 -08:00
ed4023127b [cutorch mag2gen] move eig to generic 2016-11-15 13:31:32 -08:00
2bd4e5f5f6 [cutorch mag2gen] move symeig to generic 2016-11-15 13:31:32 -08:00
d2dcbc26f8 [cutorch mag2gen] move gels to generic 2016-11-15 13:31:32 -08:00
2f05eefe9a [cutorch mag2gen] code refactor to support generics; move gesv to generic 2016-11-15 13:31:32 -08:00
7d1afa78b9 [cutorch mag2gen] generic MAGMA memory allocator function 2016-11-15 13:30:49 -08:00
dac9b020e0 [cutorch potr*] API parity for potr* functions in cutorch 2016-11-15 13:28:37 -08:00
eb77b79df9 Merge pull request #839 from Atcold/fix_ASIMD
Fix compilation for ASIMD, fix #766
2016-11-15 12:57:57 -08:00
456998f043 Merge commit 'aeed8a6ea4650d1092289a60e71d8d83875a0ba6' 2016-11-15 12:55:11 -08:00
c09f07edd9 Merge commit 'c82537462baa715b2c70726f7da8f734b2ad3a3f' 2016-11-15 12:53:29 -08:00
66320c498c Add contiguous checking / make tensors contiguous for
SpatialUpSamplingBilinear, PReLU, SpatialSubSampling, TemporalConvolution.
2016-11-15 12:50:08 -08:00
8cb8a0a146 Move VolumetricConvolution contiguous code from lua to C. 2016-11-15 12:23:09 -08:00
aeed8a6ea4 Remove duplicate entries and add optional marks in THCUNN.h 2016-11-15 21:22:14 +01:00
c82537462b [cutorch] remove syncing point from baddbmm
This change removes HtoD copies inside baddbmm. These copies
introduce a syncing point which causes slow downs in a multi
gpu training.

Test plan: Run unittests for baddbmm.
2016-11-15 11:55:36 -08:00
a8a02ff560 Fix compilation for ASIMD
On ARMv8, NEON is inherent and is instead listed as 'asimd' in /proc/cpuinfo
Replace assembly with C

Original authors:
 - @dusty-nv
    FindARM-patch.txt
    CMakeLists-patch.txt
 - @rtarquini
    NEON.c
2016-11-15 14:38:32 -05:00
72a9df19c8 Merge pull request #598 from killeent/rr2
move random functions to generic (attempt 2)
2016-11-14 11:44:41 -05:00
5b9b9634f9 [cutorch rand2gen] various fixes 2016-11-14 08:13:30 -08:00
c279a91c03 Merge commit '64c8a1377335799b322ca41d323dee13118be0ab' 2016-11-13 21:54:27 -08:00
ef6a764509 Merge commit '1cee5a359c2828800db0c41ebe0108bd5eef9501' 2016-11-13 15:23:11 -08:00
4db5afdf7e Merge commit 'f2daa616d105d700b63f05c4d544befb6e65a036' 2016-11-13 15:20:03 -08:00
7867187451 Merge commit '4f8e6ec42abd5b9b5491a49bdfe1a637e6675207' 2016-11-13 15:19:10 -08:00
4f8e6ec42a [PATCH] Improve potrf error message. (#189) 2016-11-13 15:17:05 -08:00
64c8a13773 Remove comment. 2016-11-11 15:46:44 -08:00
395ab4a287 Fix SpatialDilatedMaxPooling shape check.
In nn, indices are 3d, but they are 4d in cunn.
2016-11-11 15:43:54 -08:00
15dc862056 more improvements on error messages and shape checks. 2016-11-11 15:43:49 -08:00
f2daa616d1 Revert "Move random functions to generic" 2016-11-11 18:15:01 -05:00
64a50f5ad3 Merge pull request #589 from killeent/random-refactor
Move random functions to generic
2016-11-11 17:56:39 -05:00
1d0f86144c [cutorch rand2gen] fix illegal memory access in multinomial code, update unit tests 2016-11-11 13:23:03 -08:00
89e93bba9d [cutorch rand2gen] test fixes, add floor to geometric distribution transform 2016-11-11 13:23:02 -08:00
3290d4c7d6 [cutorch rand2gen] extend functions to use _double methods 2016-11-11 13:23:02 -08:00
ca22befc93 [cutorch rand2gen] move randn to generic 2016-11-11 13:23:02 -08:00
b08df5b9c0 [cutorch rand2gen] partial move of logNormal to generic, needs further debugging 2016-11-11 13:23:01 -08:00
ebd3c3291c [cutorch rand2gen] move geometric to generic 2016-11-11 13:23:01 -08:00
16728d2f26 [cutorch rand2gen] move multinomial to generic 2016-11-11 13:23:00 -08:00
34dab66f44 [cutorch rand2gen] move cauchy to generic 2016-11-11 13:22:59 -08:00
3a111c7499 [cutorch rand2gen] move exponential to generic 2016-11-11 13:22:59 -08:00
3600c94ec5 [cutorch rand2gen] move normal to generic 2016-11-11 13:22:58 -08:00
e2f8b00e00 [cutorch rand2gen] move bernoulli to generic 2016-11-11 13:22:58 -08:00
65ed1eba48 [cutorch rand2gen] move uniform, rand to generic 2016-11-11 13:22:57 -08:00
7fff7977fe [cutorch rand2gen] make sampleMultinomialWithoutReplacement utility function generic 2016-11-11 13:22:57 -08:00
add5922aac [cutorch rand2gen] make sampleMultinomialWithReplacement utility function generic 2016-11-11 13:22:56 -08:00
a94b54a533 [cutorch rand2gen] make sampleMultinomialOnce utility function generic 2016-11-11 13:22:56 -08:00
bea82b9da6 [cutorch rand2gen] make renormRowsL1 utility function generic 2016-11-11 13:22:56 -08:00
2e7debe282 [cutorch rand2gen] introduce THCTensorRandom.cuh, move and templatize simple binary search function 2016-11-11 13:22:55 -08:00
1cee5a359c Fix checking and spacing of dilation parameters in SpatialDilatedConvolution
and SpatialDilatedMaxPooling.
2016-11-11 10:25:44 -08:00
b08862405e Remove extraneous shape check from SpatialDilatedConvolution. (#1029) 2016-11-11 12:53:48 -05:00
d57e1a6756 change to compile with msvc && export THCDescBuff for cunn 2016-11-11 13:56:13 +08:00
c9172c5bc9 change to work on windows && ptrdiff_t replacement 2016-11-11 13:33:36 +08:00
5d5e877a05 Fix implementation of logNormal 2016-11-10 18:35:45 -08:00
1e794c87ae adding bidirectional doc 2016-11-10 17:38:47 -08:00
d9cb1b545a Fix build on 32bit platform like JETSON TK1 2016-11-11 00:22:06 +00:00
23f611f14d Rename assertSameGPU_generic to assertSameGPU.
Also remove old assertSameGPU since there is no
longer both generic and non-generic support.
2016-11-10 15:40:41 -08:00
42b28d0d69 Merge pull request #370 from gchanan/sizeCheckErrorMessages
Improving error messages in nn.
2016-11-10 18:35:22 -05:00
d0cf5f7b65 Improving error messages in nn.
Differences from nn equivalent:
1) No changes to VolumetricConvolutionMM, which doesn't exist in cunn.
2) No changes to HardShrink, which doesn't  exist in cunn.
3) LookupTable doesn't verify that all inputs are within range.
2016-11-10 15:12:35 -08:00
4699c817e8 [cutorch rand2gen] fix illegal memory access in multinomial code, update unit tests 2016-11-10 15:10:12 -08:00
4f490c16e9 [cutorch rand2gen] test fixes, add floor to geometric distribution transform 2016-11-10 13:44:55 -08:00
bcdab7a632 Remove mul/div from THCHalfAutoNumerics as they've been moved to
THCNumerics.
2016-11-10 12:13:41 -08:00
7f51af7cbc adding dropout, bidirection, etc. to RNN (#214) 2016-11-10 13:25:14 -05:00
b4ae60cac8 Protect half operations with CUDA_HALF_TENSOR with generic modules. 2016-11-10 08:59:23 -08:00
4d03d96e8b fix: cunn can't find cutorch sources
https://github.com/torch/distro/issues/138#issuecomment-259133935
2016-11-10 14:44:46 +03:00
a39ffebc3a Add THCTensor_(sizeDesc) for better debug messages. 2016-11-09 12:09:18 -08:00
4bba6082ed [cutorch rand2gen] extend functions to use _double methods 2016-11-09 11:55:51 -08:00
b111632965 [cutorch rand2gen] move randn to generic 2016-11-09 11:09:30 -08:00
0a34b34bfe [cutorch rand2gen] partial move of logNormal to generic, needs further debugging 2016-11-09 10:55:54 -08:00
6b821ece22 fixing trainer tests (#213) 2016-11-08 21:50:17 -05:00
d3b2096bfd trainer fix for new optim API 2016-11-08 15:49:03 -08:00
9f1b12bf06 Merge pull request #1009 from gchanan/spatialNNGeneric
Support generic type Spatial modules
2016-11-08 18:17:58 -05:00
e64fca4b04 Allow wider test tolerances for:
1) Size of half numbers
2) Convolution weight/bias
3) BatchNormalization
2016-11-08 13:47:01 -08:00
b941e73f4f ArgCheck that dilation parameters are > 0 and ensure tests
pick dilation parameters > 0.
2016-11-08 13:46:52 -08:00
c57873d3cb Add generic support for LookupTable.
In some cases, does not do accumulation as accreal.
2016-11-08 13:46:48 -08:00
f3bc3275ac Add generic support for TemporalConvolution.
Has increased tolerance for backward weight/bias like other
Convolution modules.
2016-11-08 13:46:45 -08:00
8df26e6c5c Add generic support for VolumetricFullConvolution, VolumetricDilatedConvolution.
Has increased tolerance for backward weight/bias like other
Convolution modules.
2016-11-08 13:46:33 -08:00
5c8ecb8150 Fix one more compatibility bug in Python 3.3 2016-11-08 16:13:25 -05:00
3928f7740a Implement functional interface for Variables (torch.*) 2016-11-08 16:13:25 -05:00
1767f73e6b Add generic support for VolumetricConvolution.
Uses the higher tolerances for weight/bias that are used for
SpatialConvolution modules.
2016-11-08 13:07:35 -08:00
9e7d5e93ab Add generic support for VolumetricReplicationPadding. 2016-11-08 13:07:35 -08:00
70c6ee93a2 Add generic support for VolumetricAveragePooling. 2016-11-08 13:07:35 -08:00
5cbf8504ef Add generic support for VolumetricMaxPooling, VolumetricMaxUnpooling,
VolumetricDilatedMaxPooling.
2016-11-08 13:07:35 -08:00
9a393b023d Add generic support for TemporalMaxPooling. 2016-11-08 13:07:35 -08:00
30bf464f73 Rebase BatchNormalization. 2016-11-08 13:06:52 -08:00
9fb1f8934b Add support for L1Cost.
Changes thrust::reduce to thrust::transform_reduce in order
to be able to do summation at accreal precision.
2016-11-08 13:01:06 -08:00
f3f02b23a0 Add generic support for SparseLinear.
We don't support SparseLinear with fp16 because of lack of cusparseHcsrmm
(or equivalent Ex function) until CUDA 8.0.
2016-11-08 13:01:06 -08:00
7668cdd32c Add generic support for DistKLDivCriterion. 2016-11-08 13:01:06 -08:00
f9dafdcf09 Add generic support for ClassNLLCriterion. 2016-11-08 13:01:06 -08:00
d284a419c1 Add generic support for BCECriterion.
Test skips comparing vs lua version for half type, because hdot is
not currently implemented in cutorch.
2016-11-08 13:01:06 -08:00
b45844e3d9 Add generic support for L1SmoothCriterion. 2016-11-08 13:01:06 -08:00
6caa7e0fff Add generic support for MultiLabelMarginCriterion. 2016-11-08 13:01:06 -08:00
1669fffb8d Add generic support for MultiMarginCriterion.
Accumulation is done at accreal precision and changes target tensor
indexing to THCIndexTensor.
2016-11-08 13:01:06 -08:00
18aa86eebd Add generic support for MSECriterion. 2016-11-08 13:01:06 -08:00
075e49d3f4 Add generic support for SoftMarginCriterion. 2016-11-08 13:01:06 -08:00
a6695b8365 Add generic support for MarginCriterion. 2016-11-08 13:01:06 -08:00
06ee48b391 Add generic support for AbsCriterion. 2016-11-08 13:01:06 -08:00
fcaeffbbd4 Fix spacing in SpatialDilatedMaxPooling. 2016-11-08 13:01:06 -08:00
6146a9a641 Generic support for SpatialFullConvolution and SpatialDilatedConvolution.
Uses matrix multiply for matrix-vector multiply for half (no matrix-vector
implementation exists).
2016-11-08 13:01:06 -08:00
83de8e40d5 Add generic support for SpatialFractionalMaxPooling. 2016-11-08 13:01:06 -08:00
30590c46a3 Generic support for SpatialConvolutionMM.
Still need Hgemv.
2016-11-08 13:01:06 -08:00
a3a5e56287 Add generic support for SpatialConvolutionLocal. 2016-11-08 13:01:06 -08:00
185c96d63a Add generic support for SpatialUpSamplingBilinear.
Math is done at accreal precision.  At real precision,
forward pass fails, but backward passes.  We do backward
pass at accreal precision for consistency.
2016-11-08 13:01:06 -08:00
be61ad6eb4 Add generic support for SpatialUpSamplingNearest.
Accumulates as AccType.
2016-11-08 13:01:06 -08:00
222dfd2259 Add generic support for SpatialReplicationPadding. 2016-11-08 13:01:06 -08:00
b06e1c7e1d Add generic support for SpatialReflectionPooling. 2016-11-08 13:01:06 -08:00
6876abba51 Add generic support for SpatialSubSampling.
Half types fail on backward, probably because we don't consistently
accumulate in accreal.  This is difficult because gradInput is
accumulated directly (either with atomicAdd or not) rather than
in another variable.
2016-11-08 13:01:06 -08:00
0798466a01 Generic support for SpatialCrossMapLRN
Removed the C-linkage for a couple of functions because they are now generic --
not sure if they were used by anyone outside.
2016-11-08 13:01:06 -08:00
2cda782273 Add generic support for SpatialAveragePooling. 2016-11-08 13:01:06 -08:00
7d1c9554b6 Add generic support for SpatialAdaptiveMaxPooling. 2016-11-08 13:01:06 -08:00
a29d16f1a8 Use THCIndexTensors more generally. 2016-11-08 13:01:06 -08:00
6d0c1c0f17 Use indices for SpatialAdaptiveMaxPooling indices. 2016-11-08 13:01:06 -08:00
5ed4b5c25b Add generic support for SpatialMaxUnpooling. 2016-11-08 13:01:05 -08:00
6fe89c5e44 Fix tests 2016-11-08 13:01:05 -08:00
fda8c37641 Add generic support for SpatialMaxPooling.
Also fix tests for SpatialDilatedMaxPooling.
2016-11-08 13:01:05 -08:00
6d5a0ff3a1 Get SpatialDilatedMaxPooling generic working with long tensors as index.
Does as much math as possible in accreal to try to suss out why CudaHalfTensor fails.
2016-11-08 13:01:05 -08:00
f8718dd355 Add generic support for SpatialDilatedMaxPooling. 2016-11-08 13:01:05 -08:00
85af686797 Add generic support for SpatialClassNLLCriterion. 2016-11-08 13:01:05 -08:00
0f6ec3f15f Remove fastExpIfAvail and benchmarking from functional tests.
Also fix broken IFNDEF and test whitespace.
2016-11-08 13:01:05 -08:00
44644c50ee Reorganize THCHalfAutoNumerics. 2016-11-08 13:01:05 -08:00
9749f7eacc Add generic support for RReLU. 2016-11-08 13:01:05 -08:00
d9a2bdb9df Add generic support for PReLU.
This is the first instance of functions that take a lua number but
are not reals in C.  So, instead of automatically converting lua
numbers in the half case, we parse the function definitions to
find the argument positions to convert.
2016-11-08 13:01:05 -08:00
57e678c94b fix logsoftmax 2016-11-08 13:01:05 -08:00
516f127cfd Add generic support for LogSoftMax. 2016-11-08 13:01:05 -08:00
e477add103 Add generic support for SoftMax.
Math is done at accreal precision (e.g. for half,
math is done at float precision).  Originally code
called __expf, which doesn't have a double equivalent;
we call exp instead of converting down.
2016-11-08 13:01:05 -08:00
ba3d577875 Add generic support for ELU. 2016-11-08 13:01:05 -08:00
917e4f47c4 Add generic support for SoftShrink. 2016-11-08 13:01:05 -08:00
0143dac247 Add generic support for Square.
Math is (arbitrarily?) done at double precision to
keep the intent of existing code.
2016-11-08 13:01:05 -08:00
d2390f3616 Add generic support for Sqrt. 2016-11-08 13:01:05 -08:00
949ea73402 Add generic support for LeakyReLU. 2016-11-08 13:01:05 -08:00
d1e2fe0efe Add generic support for Threshold. 2016-11-08 13:01:05 -08:00
584ada12bf Add generic support for LogSigmoid.
This has the same logic as Sigmoid; i.e.
math is done at double precision and then
stored back at desired precision.
2016-11-08 13:01:05 -08:00
3ead72f654 Add generic support for Sigmoid.
This maintains the existing logic of doing the math in
double precision and converting back to the intended
type (previously: just float).  We do the same for
half here, although perhaps we should do the math
at float in that case.

There is some question about what to do with conversions;
Sigmoid did math in double before converting back to float;
we keep this intent, although there is some question on whether
this was intentional and for half -- should we just go up to
float or up to double?
2016-11-08 13:01:05 -08:00
9ce96d3bd3 Add generic support for Abs. 2016-11-08 13:01:05 -08:00
5549c003d9 Add generic support for HardTanh. 2016-11-08 13:01:05 -08:00
46105bf90b Add generic support for Tanh. 2016-11-08 13:01:05 -08:00
73ce3b3702 Add generic support for SoftPlus.
Adds the ability to "genericize" cunn modules that can exist
simultaneously with non-generic modules (i.e. modules can
be genericized one at a time).  Allowing both generic and
non-generic modules simultaneously requires some extra code
that can be removed once every module is genericized.
Also genericizes SoftPlus in this way.
2016-11-08 13:01:05 -08:00
1c6225dc2f [cutorch rand2gen] move geometric to generic 2016-11-08 10:47:28 -08:00
44874542c8 fix printing in console (#208) 2016-11-08 13:42:26 -05:00
31f2846aff [cutorch rand2gen] move multinomial to generic 2016-11-08 09:34:19 -08:00
bc08011e72 Don't longjmp out of omp loops in unpooling modules 2016-11-08 18:12:56 +01:00
7cccc216d0 ArgCheck that dilation parameters are > 0. 2016-11-08 18:12:56 +01:00
09493603f6 Change optimizer API 2016-11-08 18:12:56 +01:00
e799bd0ba9 Restrict in-place autograd ops to disjoint variables 2016-11-08 18:12:56 +01:00
40247b0382 Fix torch tests in Python 3.3 and 3.4 2016-11-08 18:12:56 +01:00
cd2e9c5119 [cutorch rand2gen] move cauchy to generic 2016-11-08 08:11:39 -08:00
0b6f7b12b1 [cutorch rand2gen] move exponential to generic 2016-11-08 08:04:26 -08:00
86e42ba291 Adding truncated tensor printing (#202)
* Adding truncated tensor printing
2016-11-08 10:05:30 -05:00
e0a18cafd3 Don't longjmp out of omp loops in unpooling modules 2016-11-08 13:23:43 +01:00
8c2f77cab6 updated autogen docs 2016-11-07 17:19:00 -05:00
c1bd6ba1e1 Zero-initialize outputs for BLAS functions 2016-11-07 22:50:56 +01:00
df59b89fbb Add more optimizers 2016-11-07 22:50:56 +01:00
8fd9cc160c [cutorch rand2gen] move normal to generic 2016-11-07 13:26:59 -08:00
28e3f07b63 adding apply function 2016-11-07 16:17:49 -05:00
513d902df1 adding __repr__ for nn 2016-11-07 16:17:40 -05:00
fce14a9f51 [cutorch rand2gen] move bernoulli to generic 2016-11-07 13:16:10 -08:00
884107da01 [cutorch rand2gen] move uniform, rand to generic 2016-11-07 12:27:30 -08:00
caa79a354a [cutorch rand2gen] make sampleMultinomialWithoutReplacement utility function generic 2016-11-07 10:33:03 -08:00
5bb873a2fe [cutorch rand2gen] make sampleMultinomialWithReplacement utility function generic 2016-11-07 10:28:19 -08:00
bc0442d7df [cutorch rand2gen] make sampleMultinomialOnce utility function generic 2016-11-07 10:15:13 -08:00
cfcd33552b [cutorch rand2gen] make renormRowsL1 utility function generic 2016-11-07 10:02:21 -08:00
5f6b9fd5ba [cutorch rand2gen] introduce THCTensorRandom.cuh, move and templatize simple binary search function 2016-11-07 08:31:19 -08:00
469dce4a2d skip test_scatter_gpu on no CUDA 2016-11-05 20:10:07 -04:00
55d32de331 Fix bugs in torch.legacy.nn and add regression tests 2016-11-05 22:48:52 +01:00
4491d2d3cb Expose ger, mv, mm, bmm as tensor methods 2016-11-05 22:48:52 +01:00
f9669b9b9a Merge pull request #583 from nicolasvasilache/master
THC UVA Allocator
2016-11-05 11:50:07 -04:00
246d5f37c7 THC UVA Allocator 2016-11-05 02:40:44 +00:00
293bfb03dd Merge commit '4def4e696b9079f587d0dba3e86423df5ea429b8' 2016-11-03 14:12:22 -07:00
4def4e696b fix result type 2016-11-03 14:10:49 -07:00
b6e58c030a enable dot for CUDA_HALF 2016-11-03 13:50:50 -07:00
bf00308ab2 Merge commit 'fd677945741b4ee353079911993ada3770e07f5c' 2016-11-03 13:31:12 -07:00
e3e786e35e Move source code checks from __getstate__ to torch.load (#200)
The __getstate__ and __setstate__ functions are called from copy.copy as
well as pickling. The source code inspection currently slows down the
data parallel code because it makes a copy of the object every
iteration.
2016-11-03 16:29:14 -04:00
fd67794574 Merge pull request #581 from torch/dotfix
making dot to have an accreal return type (consistent with CPU)
2016-11-03 12:51:27 -04:00
104b502919 ArgCheck that dilation parameters are > 0. 2016-11-03 09:02:22 -07:00
a18cd3ba92 ArgCheck that dilation parameters are > 0. 2016-11-03 09:01:43 -07:00
0676cad200 Merge commit 'e644f6ed2c1965b0de55cc9037d5c75245f63d54' 2016-11-03 08:36:42 -07:00
3b1d217310 Merge commit 'e32af0196e10ad11b3938ad73ec5ef49cac7c03e' 2016-11-03 08:36:04 -07:00
93bcb2e7ba making dot to have an accreal return type (consistent with CPU) 2016-11-02 16:40:54 -07:00
ebc70f7919 Look for libcudart in default CUDA installation paths (#195) 2016-11-02 19:36:10 -04:00
e32af0196e Merge pull request #828 from apaszke/lapack
Add more size checks and improve some LAPACK error messages
2016-11-02 18:53:45 -04:00
3e5c121c56 Adding !!inc to cwrap and splitting up TensorMethods.cwrap (#197)
* Adding !!inc to cwrap and splitting up TensorMethods.cwrap
2016-11-02 18:50:56 -04:00
e644f6ed2c Add supporting code for CUDA IPC
This adds three small pieces to help with sharing THCStorages across
processes:

 1. THCIpcAllocator: a THCDeviceAllocator to close shared memory handles in the
    child process.
 2. THCCachingAllocator_getBaseAllocation which returns the pointer and
    size of the underlying cudaMalloc allocation. This is necessary
    because cudaIpcGetMemHandle requires 'base' pointers
 3. Support for TH_STORAGE_VIEW in THCStorage_(free). This is useful in
    child processes to represent THCCachingAllocator allocations split
    from a larger cudaMalloc call.
2016-11-02 14:53:28 -07:00
551a7c72f3 Fix multiprocess serialization with "spawn" or "forkserver" (#198) 2016-11-02 17:44:36 -04:00
05b121841e Add more size checks and improve some LAPACK error messages 2016-11-02 21:51:51 +01:00
c29aea89ee Merge pull request #827 from howard0su/freebsd
Fix compile error on freebsd
2016-11-02 16:10:50 -04:00
103e70ccc5 adding cuda types for tensor methods (#194) 2016-11-02 10:25:58 -04:00
ec7ecbe2dd Fix compile error on freebsd 2016-11-02 20:27:05 +08:00
7a06dbb87e Merge commit '1234e434fa2b6ddd440194c8bccd352593902c69' 2016-11-01 21:33:41 -07:00
1234e434fa TH_INDEX_BASE for nonzero 2016-11-01 21:08:52 -07:00
2d374f982e Changes for ccache nvcc support 2016-11-01 15:54:33 -04:00
4e73630a95 Fix criterion backward, that was modifying grad_output shape 2016-11-01 19:31:53 +01:00
e867baa5f9 Accept file paths in torch.save and torch.load 2016-11-01 19:31:53 +01:00
04b750cb52 Improve Parameter's __repr__ 2016-11-01 19:31:53 +01:00
97c7b12542 Fix Variable __setstate__ refcounting bugs 2016-11-01 19:31:53 +01:00
0dfec752a3 Merge commit 'f16f68e103dfc22921f6106ec7136ddc7a0ab087' 2016-11-01 10:38:13 -07:00
f16f68e103 CMake: Install generic/THCTensorMathScan.h 2016-11-01 16:07:07 +01:00
4b7f8f9b77 adding notes for compiling from source 2016-11-01 01:27:28 -04:00
9969d50833 fix for CPU-only builds 2016-11-01 01:19:37 -04:00
7355c63845 adding multiple types for dist 2016-10-31 21:26:19 -07:00
16cac6442a adding multiple types for cumsum, cumprod 2016-10-31 21:26:19 -07:00
5009ae5548 adding multiple types for pow, trace, diag, tril, triu 2016-10-31 19:26:08 -07:00
32647e285e implement torch.nonzero 2016-10-31 18:22:49 -07:00
6df334ea68 Improve potrf error message. (#189) 2016-10-31 18:48:29 -04:00
f8501042c1 Make _requires_grad Variable attribute writeable 2016-10-31 22:47:09 +01:00
be085b8f6c Allow marking non-leaf variables as non-requiring grad 2016-10-31 22:47:09 +01:00
ef557761dd Allow to not use all function outputs in autograd 2016-10-31 22:47:09 +01:00
15377ac391 Copy Module._buffers in nn.parallel.replicate (#180) 2016-10-31 12:12:29 -04:00
ad5fdef6ac Make every user-visible Tensor have a Storage (#179) 2016-10-31 12:12:22 -04:00
0cb5943be8 Fix NCCL reduce_scatter in Python 2.7 (#183) 2016-10-30 17:58:02 -04:00
fb593d5f28 Fix bugs in variable __setitem__ and improve __getitem__ 2016-10-30 00:16:06 +02:00
645c913e4f Print GPU id for CUDA tensors 2016-10-30 00:16:06 +02:00
b4f4cca875 Rename training and evaluation methods 2016-10-30 00:16:06 +02:00
6027513574 Add support for indexing with numpy types 2016-10-30 00:16:06 +02:00
849188fdab Fix multiprocessing 2016-10-29 14:23:23 -07:00
a9c14a5306 Remove unused code 2016-10-28 15:28:22 -07:00
2da36a14d1 Clean up cuDNN code and fix chooseBackwardFilterAlgorithm 2016-10-28 13:05:53 -07:00
2ee451f5f7 Build in Release mode 2016-10-28 12:51:19 -07:00
f2d7e94948 Use torch.Size for Tensor sizes and tuple for strides
See issue #20

The torch.Size class is a tuple subclass which distinguishes sizes from
other tuples so that torch.Tensor(size) is interpreted as size instead
of data.
2016-10-28 19:37:09 +02:00
2031dfc08a Add hdot support for CUDA 8.
If not compiled with CUDA 8+, an error is raised indicating that
CUDA 8.0+ is required.
2016-10-27 15:01:09 -07:00
34ede14877 Fix compile error due to THCStorage change 2016-10-27 14:27:10 -07:00
2af3098e5a Merge commit '42e835ebb81a3ecf8f76e15bb1866c1427f61d74' 2016-10-27 13:49:23 -07:00
2e44511b13 Merge commit 'bbe8627a3f0e6cbb8fd1952826f75df741e44b01' 2016-10-27 13:47:36 -07:00
7bc4aa7e72 Merge commit '2bd36604e298547cc66f175588c925271223b4e9' 2016-10-27 13:46:38 -07:00
e2458bce97 Add Parameter class to nn 2016-10-27 22:31:36 +02:00
ae9789fccc adding input / output / member sections to the docgen 2016-10-27 01:11:53 -04:00
45ef25ea27 fix rnn documentation typos and format 2016-10-27 01:11:53 -04:00
ad2d413c0b Add C++ bindings for cuDNN (#167)
The Python ctypes bindings overhead was high enough that it slowed down
multi-gpu training when using 4+ Maxwell GPUs.
2016-10-26 19:51:48 -04:00
30924ff1e0 Fix test_nonzero flakiness (#173) 2016-10-26 19:50:56 -04:00
383c48968f Add support for indexing with ellipsis (#172) 2016-10-26 19:50:44 -04:00
bbe8627a3f Use 'void' for no-arg functions 2016-10-26 12:44:34 -07:00
2bd36604e2 Fix no-arg function prototypes 2016-10-26 12:35:05 -07:00
9ed47ef531 fix bug in mmaping 2016-10-26 07:23:04 -07:00
139f98a872 pushing THCState back to the header 2016-10-25 18:23:53 -07:00
c825895190 Make KwargsPlugin output deterministic 2016-10-26 00:19:33 +02:00
42e835ebb8 Add sameGPU checks to BatchNormalization (#361) 2016-10-25 15:19:03 -04:00
a7d5fdf54e Add integer indexing for MultiLabelMarginCriterion. 2016-10-25 11:42:56 -07:00
3b4e41f6ec Add integer indexing for MultiMarginCriterion. 2016-10-25 10:19:53 -07:00
5505e1de7d Store the device in THCStorage 2016-10-25 07:21:54 -07:00
6d329e418b allocator updates 2016-10-25 07:07:52 -07:00
3a11afb57f some bugfixes for THC 2016-10-24 17:16:17 -07:00
df86e02c9e update nn docs 2016-10-24 17:20:00 -04:00
deebc1383e Show exponent when printing vectors 2016-10-24 22:30:11 +02:00
19f2f1a9d3 Buffer values when constructing a CUDA tensor from a sequence 2016-10-24 22:30:11 +02:00
4dc13ecdd8 Make tests deterministic 2016-10-24 22:30:11 +02:00
b4b6e356ef Fix clang warnings 2016-10-24 22:30:11 +02:00
9000f40e61 Add torch.from_numpy 2016-10-24 22:30:11 +02:00
f137c0c05a Improve error messages of stateless functions 2016-10-24 22:29:43 +02:00
b43a02a9aa Make random 0-based 2016-10-24 22:29:43 +02:00
30be715900 Add training and evaluation to torch.nn 2016-10-24 22:29:43 +02:00
71cf8e14cb Fixes in torch.legacy.nn 2016-10-24 22:29:43 +02:00
ffd4863b23 Don't build nccl on macOS 2016-10-24 22:29:43 +02:00
4c17098bb8 Fix platform detection in torch.cuda 2016-10-24 22:29:43 +02:00
bcfdd18599 Fix python2.7 compatibility and check cffi version in ffi utils 2016-10-24 22:29:43 +02:00
067662d280 making .numpy return writeable arrays (#164) 2016-10-24 16:23:28 -04:00
93d02e4686 Merge pull request #129 from adamlerer/cudnn_rnn
CuDNN + PyTorch RNN library
2016-10-24 15:00:02 -04:00
12de115305 Fix Lua->Python logic in legacy.optim 2016-10-24 20:04:23 +02:00
b5d13296c6 addressing comments 2016-10-23 21:11:22 -07:00
86288265ad Adding rnn cell library 2016-10-23 20:23:48 -07:00
a559d94a44 docs and such 2016-10-23 20:23:48 -07:00
1eb6870853 add nobias option to rnn 2016-10-23 20:23:48 -07:00
f88c3e9c12 fix some missing features in pytorch needed for RNNs 2016-10-23 20:23:48 -07:00
942ca477a6 Copying weights for CUDNN 2016-10-23 20:23:48 -07:00
b0e33fb473 cudnn + THNN match with parameters 2016-10-23 20:23:48 -07:00
d58b627b98 CUDNN RNN bindings 2016-10-23 20:23:48 -07:00
b85fc35f9a Fix for versions compiled without CUDA support (#155)
* Fix pytorch when compiling without CUDA support
* Skip print test with CUDA types if CUDA is not available
2016-10-23 13:03:10 +02:00
bcb466fb76 fix bug with numpy conversion and storageOffset > 0 (#154) 2016-10-22 11:56:18 -04:00
6db721b5dd Make DataLoader preserve the ordering of the dataset (#135) 2016-10-21 23:54:16 -04:00
140c65e52b fixing python setup.py clean 2016-10-21 23:20:02 -04:00
29e8d77ce0 Merge pull request #558 from gchanan/genericDeviceTensorUtils
Add generic type support for toDeviceTensor.
2016-10-19 18:19:13 -04:00
b66a4ea919 Add THNN_CHECK_DIM_SIZE_INDICES to avoid pointer conversion warnings. 2016-10-19 15:01:49 -07:00
d3d59e5024 Indices for nn. 2016-10-19 14:53:19 -07:00
5285da0418 Use index types for SpatialAdaptiveMaxPooling indices. 2016-10-19 14:53:10 -07:00
a76e69d709 Use index types for Max Pooling / Unpooling indices. 2016-10-19 14:52:58 -07:00
4d0d775d16 Add generic type support for toDeviceTensor. 2016-10-19 14:36:03 -07:00
98f67e90d5 Fix super call in Container.modules and Container.parameters (#142) 2016-10-19 13:21:03 -04:00
fee67c2e1a Allow parameters and child modules to be assigned by attribute (#136)
For example:
  self.linear = nn.Linear(10, 20)
  self.weight = torch.autograd.Variable(torch.Tensor(10, 20))
2016-10-18 23:34:20 +02:00
c295f26a00 Support async argument to Variable.cuda (#137) 2016-10-18 23:27:11 +02:00
8a09c45f28 Fix typo 2016-10-18 09:29:19 -07:00
79ead42ade Add CUDA Stream and Event API (#133) 2016-10-18 12:15:57 -04:00
94e52e1d17 Fix Variable.cat 2016-10-17 15:36:08 -07:00
3931beee81 Use THSetNumThreads instead of omp_set_num_threads
Set OMP num threads to one in the data loader.

Fixes #81
Fixes #82
2016-10-17 15:15:00 -04:00
d293c17d21 Merge commit '1a3920e5dc546803ec8ada369ff1b0d56cf24e76' 2016-10-17 10:29:41 -07:00
1a3920e5dc Expose OpenMP num threads through TH lib
Expose omp_set_num_threads and similar APIs through the TH lib. This
means third-party libraries using TH don't need to be compiled with
OpenMP support just to control the number of TH OMP threads.
2016-10-17 10:09:10 -07:00
ffc3eb1a24 Exclude THNN Linear in favor of Python implementation 2016-10-17 09:53:20 -07:00
2f5d4a7318 gcc 5 + cuda < 8 workaround improved 2016-10-17 12:46:21 -04:00
70553f4253 gcc 5 + cuda < 8 workaround improved 2016-10-17 12:45:45 -04:00
8d39fb4094 Use new THC API for device allocator 2016-10-17 09:35:41 -07:00
7d10b2370f Merge commit 'ec7a2878013ec70a4d4a8bfb6f5e5503f87f9ea0' 2016-10-17 09:35:04 -07:00
31ec7650ac Merge commit '429f2d67652f4fcba0bbf65c7d3e109e136a9cdf' 2016-10-17 09:33:06 -07:00
c014920dc1 Merge commit 'b01c78580594c53e6afb02b3d2110577a4673308' 2016-10-17 09:32:01 -07:00
17e3d4e1ee Merge commit '38cb3d02270b9e558a891a9a2bef01a75d1bd9e1' 2016-10-17 09:31:38 -07:00
b01c785805 Fix cutorch.getStream()
state->numUserStreams does not include the NULL stream, which is stored
in res->streams[i]
2016-10-17 08:49:23 -07:00
0eea71f878 torch.cat for multiple cuda types 2016-10-17 01:56:33 -04:00
ec7a287801 Merge pull request #1006 from torch/errorsimprovements
more improvements on error messages and shape checks
2016-10-17 00:46:21 -04:00
4bc585a2fe more improvements on error messages and shape checks 2016-10-17 00:37:50 -04:00
429f2d6765 fixes to upsampling bilinear API 2016-10-17 00:30:25 -04:00
a0c7e3cf04 Merge pull request #550 from colesbury/streams
Add stream API that is not based on indices
2016-10-16 19:08:03 -04:00
9cd68129da fixing typo 2016-10-16 19:07:09 -04:00
aa6f6117b7 Ported Linear module to THNN 2016-10-16 17:49:47 +02:00
6fa9c87aa4 Merge pull request #548 from BTNC/win-msvc
make cunn compile with msvc && fix compilation failure for linux/mac os
2016-10-15 22:07:52 -04:00
ee14cf9438 Add support for pinned memory: (#127)
torch.Storage/Tensor.pin_memory()
 torch.Storage/Tensor.is_pinned()
2016-10-15 18:38:26 -04:00
0391bbb376 Fix view_as and view for empty tensors (#128) 2016-10-15 18:33:05 -04:00
28ada0c634 update md docs 2016-10-14 18:56:24 -04:00
2c233d23ad Add stream API that is not based on indices
This implements the THC code so that we can expose streams as objects
instead of simply referring to them by indices. This is not exposed in
Lua yet.
2016-10-14 15:25:38 -07:00
59c628803a fixing padding_idx option 2016-10-14 15:05:21 -07:00
6b830bc77f Merge pull request #78 from colesbury/nccl
Use NCCL in comm.py if available
2016-10-14 17:44:11 -04:00
f30081a313 Use NCCL bcast and reduce functions in comm 2016-10-14 14:16:32 -07:00
c15648c6b5 Add NCCL build scripts 2016-10-14 14:16:32 -07:00
a02917f502 Fix typo 2016-10-14 14:07:29 -07:00
70d8bd04c0 Make cuDNN descriptors extend object
Fixes weird double __del__ issue
2016-10-14 13:58:20 -07:00
ad2cee0cae Fix caching allocator when used from multiple Lua threads
Use a single, global THCCachingAllocator instance.

Previously, each Lua thread had its own THCCachingAllocator instance.
However, threads can share storages, which means a segment could be
allocated from one THCCachingAllocator and freed on another, which
breaks.

Fixes #539
2016-10-14 10:08:56 -07:00
756a7122ad torchdoc 2016-10-14 04:18:10 -04:00
3d6ebde756 qr and ormqr tests and bugfix 2016-10-14 03:10:16 -04:00
daa30aa992 fix typo 2016-10-13 23:11:32 -07:00
39459eb238 make cunn compile with msvc && fix compilation failure for linux/mac os 2016-10-14 12:54:00 +08:00
0325e2f646 Major autograd refactor
Improves autograd performance by more than 2x and fixes a couple
of bugs. All core functions have been moved to C.
2016-10-13 17:17:49 -07:00
93b8b5631f Improve CUDA tensor constructor speed 2016-10-13 17:16:39 -07:00
60ab1ce0c1 Stop using contextlib for device and device_of 2016-10-13 17:16:39 -07:00
2f186df52d removing CUDA_HALF_INSTRUCTIONS and enabling hgemm only for P100 2016-10-13 16:52:40 -07:00
452e07d432 Revert "change to work on windows && replace long with ptrdiff_t" 2016-10-13 18:09:34 -04:00
05d1404b9c Revert "changes to make cunn compile on windows with msvc" 2016-10-13 18:08:56 -04:00
534b9a1697 Bump to 1.3.1 2016-10-13 10:33:05 -07:00
b2781d0501 Fix primitives function prototype 2016-10-13 10:32:42 -07:00
bf7d1514f7 NVML (libwrap) : import the needed definitions 2016-10-13 10:28:59 -07:00
2acee24332 Add keyword argument support to most tensor functions 2016-10-13 12:32:04 -04:00
e7639e55f8 change to work on windows && replace long with ptrdiff_t 2016-10-13 23:44:28 +08:00
f978eca477 change to work on windows && replace long with ptrdiff_t 2016-10-13 22:55:58 +08:00
eb3ac2b367 changes to make cunn compile on windows with msvc 2016-10-13 22:22:23 +08:00
968d386b36 Make atomicAdd functions static inline. 2016-10-12 15:18:30 -07:00
38cb3d0227 Fix build when NEON is supported 2016-10-12 12:51:22 +00:00
6f606dd5f9 updating nn docs 2016-10-11 14:41:25 -04:00
bab616cf11 Fix OOM error message in tensor constructor 2016-10-10 20:51:15 -07:00
966adc6291 Simplify torch.cat 2016-10-10 20:51:15 -07:00
518cb6ec7c Allow specifying output size in MaxUnpooling 2016-10-10 20:51:15 -07:00
34bcd4c237 Rename FullConv to ConvTranspose and allow specifying output size 2016-10-10 20:51:15 -07:00
a121127082 Merge remote-tracking branch 'upstream/master' into more-generic-functions 2016-10-10 10:09:43 -07:00
50326e94b1 try cudnn 5.1.5 and 5.1.3 in that order to load them up. This is needed because cudnn for cuda 7.5 ships with 5.1.3 and cudnn for cuda 8.0 ships with 5.1.5 2016-10-09 22:26:43 -04:00
160723b5b4 fix cudnn lib name 2016-10-09 21:19:50 -04:00
7991125293 Improve error messages 2016-10-08 20:37:40 -07:00
96f61bff30 Add LAPACK functions 2016-10-08 20:37:37 -07:00
a94488f584 replace long with ptrdiff_t for memory size/offset, element count 2016-10-08 21:39:16 +08:00
f2cf673d3a fix tensor printing when the tensor is a view into a giant storage 2016-10-07 17:53:37 -04:00
c4595a3dd6 [cutorch refactor] addcmul/addcdiv to generic 2016-10-07 13:09:05 -07:00
5db118e64b Update LogSoftMax to work in spatial domain 2016-10-07 16:08:39 -04:00
8bb06c94be Improved allreduce segmentation for small sizes 2016-10-07 12:42:23 -07:00
1620c56808 [cutorch refactor] cmin/cmax to generic 2016-10-07 11:50:28 -07:00
e88e0026b1 [cutorch refactor] make dist(...)'s op generic, add missing unit test 2016-10-07 11:50:28 -07:00
ace9b49e28 [cutorch refactor] move cross(...) to generic 2016-10-07 11:50:28 -07:00
da90751add [cutorch refactor] move lerp(...) to generic 2016-10-07 11:50:28 -07:00
8cc566f7b5 [cutorch refactor] move clamp(...) to generic 2016-10-07 11:50:28 -07:00
02ad199905 [cutorch refactor] make var(...) generic 2016-10-07 11:50:28 -07:00
c3e0811d86 [cutorch refactor] cleanup code in prep for review 2016-10-07 11:50:28 -07:00
499d1c5709 [cutorch refactor] fixes for norm, wrap/test 2016-10-07 11:50:28 -07:00
cf16ec45e1 [cutorch refactor] move stdall into generic, wrap test for std 2016-10-07 11:50:27 -07:00
daa15dcceb [cutorch refactor] move varall into generic 2016-10-07 11:50:27 -07:00
32556cbe5e [cutorch refactor] move normall to generic 2016-10-07 11:50:27 -07:00
74d9c674f5 Make _norm(...)'s ops generic 2016-10-07 11:50:27 -07:00
a4da558fa0 [cutorch refactor] move mean function into generic/ 2016-10-07 11:50:27 -07:00
dba6d1d57f Make _norm(...)'s ops generic 2016-10-07 11:50:27 -07:00
b01c4338c9 [cutorch refactor] move std function into generic 2016-10-07 11:50:27 -07:00
811d947da3 [cutorch refactor] move renorm function into generic 2016-10-07 11:50:27 -07:00
de7bf7efe6 [cutorch refactor] move std function into generic 2016-10-07 11:50:27 -07:00
5537df9927 [cutorch refactor] make _renorm(...)'s ops generic 2016-10-07 11:50:27 -07:00
81fea93741 [cutorch refactor] move std function into generic 2016-10-07 11:50:27 -07:00
df1065a2d8 Move _std dependencies into THCTensorMathReduce.cuh 2016-10-07 11:50:27 -07:00
c2e3bf2145 [cutorch refactor] move meanall function into generic/, update cwrap for lua mean 2016-10-07 11:49:33 -07:00
a4d849ef68 [cutorch refactor] move mean function into generic/ 2016-10-07 11:49:33 -07:00
957c9f3853 Move atomicAdd functions to THCAtomics.cuh in order to share
definitions with other projects, e.g. cunn.
2016-10-07 11:43:02 -07:00
3958b6b0e1 Merge pull request #338 from nitsky/spatial_logsoftmax
SpatialLogSoftMax
2016-10-07 10:36:40 -04:00
5d70feb573 bug fix for wrong usage of checkGPU && port to windows with msvc 2016-10-07 15:55:38 +08:00
a22af69335 Add versioning and shared storage handling to autograd (#105) 2016-10-06 17:12:58 -04:00
1213149a2f add bias option to linear; allow modules to return nested lists/tuples of tensors (#106)
* add bias option to linear; allow modules to return nested lists/tuples of tensors
2016-10-06 15:59:12 -04:00
398b6f75cd update nn.md 2016-10-05 14:56:41 -04:00
e46e05e7c5 fix container doc 2016-10-05 14:53:41 -04:00
166028836d Ignore graph parts not requiring gradient in engine 2016-10-05 08:46:34 -07:00
3cbe66ba8c Change requires_grad default to False 2016-10-05 08:46:34 -07:00
99de537a2e Remove CUDA sync points from losses and trainer 2016-10-05 08:46:31 -07:00
1d0afdf9f7 Make requires_grad read only (except for leaves) 2016-10-05 07:55:07 -07:00
4db6667923 Allow specifying per-parameter optimization parameters 2016-10-04 18:21:50 -07:00
80e16e44aa Check container source on load 2016-10-04 17:41:12 -07:00
58b134b793 Allow exporting optimizer state as a dict 2016-10-04 17:33:49 -07:00
6efefac2df Add parameter_dict and load_parameter_dict methods for modules 2016-10-04 14:47:56 -07:00
0c9670ddf0 Allow remapping storages at load time and serialize data in little endian order 2016-10-04 12:54:55 -07:00
53c65ddc6a Fix memory leak when constructing a tensor from numpy (#98) 2016-10-03 23:27:54 -04:00
33371c5164 ffi tests skip on cuda 2016-10-03 12:15:28 -07:00
64dd1419c5 Fix Variable indexing bugs (#96) 2016-10-03 14:49:21 -04:00
108068a417 python 2.7 fixes 2016-10-03 00:14:06 -07:00
6e8ed95ada ‘fix compilation error: 'orr' loop initial declarations are only allowed in C99 mode 2016-10-03 14:11:59 +08:00
39c9f9e9e8 replace long with ptrdiff_t for memory size/offset etc 2016-10-03 12:55:30 +08:00
b555588f5d Make THNN lazy init thread safe 2016-10-02 21:36:05 -07:00
47ef4bb0a0 Fix memory leak in torch.cat 2016-10-02 21:36:05 -07:00
b34654bf97 Merge commit 'ab0e86ae4b0a08b8d0a67f1494ff80e65a6932ad' 2016-10-02 20:58:29 -07:00
6068df3ab2 Merge commit '60a8a9e918e04fd5581d20e4e7527dd115c69cd8' 2016-10-02 20:56:33 -07:00
bb35999f51 Merge commit '25c51c49aa3bb9ac5f64560a46f1f2a905f4e3f7' 2016-10-02 20:55:38 -07:00
25c51c49aa adding stdc++ static linking on TH_BINARY_BUILD=1 always, because caching allocator uses c++ 2016-10-02 20:48:35 -07:00
833bedb46b cudnn relative check in binary builds 2016-10-02 11:45:46 -07:00
3d8eba7b42 updating readme with new info 2016-10-02 10:13:15 -07:00
ab0e86ae4b fix arm neon bug 2016-10-02 08:35:40 -07:00
94b35312d0 Compile fixes for picky compilers / stl versions (#518)
* Compile fixes for picky compilers/stl versions
2016-10-02 00:41:47 -04:00
f4ebc65a12 Add Module.modules() and Module.children() (#90)
modules(): returns an iterator over all modules in the network
 children(): returns an iterator over immediate children

Also fix __getitem__ in Sequential
2016-10-01 21:18:53 -04:00
2bc9da4f5e Support "device" keyword argument (#79)
Adds the optional "device" keyword argument to Tensor and Storage
constructors and .new methods.
2016-10-01 19:32:55 -04:00
e034f258e3 Fix ffi utils in Python 2.7 2016-10-01 15:37:05 -07:00
39adf6dbd2 Merge pull request #80 from colesbury/data
Fixes to trainer and data loading
2016-10-01 16:50:42 -04:00
112df5f664 Fixes to trainer and data loading
1. Wrap target in a Variable in trainer
2. Collate numbers into torch.Long/DoubleTensors
2016-10-01 13:21:16 -07:00
3564b77553 a couple of changes for win32 (#779)
* windows timer with milliseconds
2016-10-01 15:27:30 -04:00
c813e93d85 fixing python 3 compat 2016-09-30 16:44:00 -07:00
ff59385034 Add 'torch/lib/nccl/' from commit 'ca330b110ae76ace344182ab83a028911111cc36'
git-subtree-dir: torch/lib/nccl
git-subtree-mainline: ea4f812a123a99d3beda1fdf4a2197035981eccb
git-subtree-split: ca330b110ae76ace344182ab83a028911111cc36
2016-09-30 16:35:16 -07:00
ea4f812a12 Fix Container.parameters() 2016-09-30 16:31:36 -07:00
dbe540e49f Use the custom TH error handler in all threads by default 2016-09-30 14:59:50 -07:00
c1c0969834 Allow changing the default error handler for all threads
THSetErrorHandler still modifies per-thread pointers, but
THSetDefaultErrorHandler allows to set a handler that's
used by all threads that haven't specified any function.
2016-09-30 14:59:50 -07:00
b87f26ce26 windows high resolution timer with a few makefile changes (#776)
windows high resolution timer
2016-09-30 14:59:50 -07:00
67335e638c bug fix for read/writeLong in THMemoryFile 2016-09-30 14:59:50 -07:00
90916f34a7 fix cpuid ecx; change to compile with msvc 2016-09-30 14:59:50 -07:00
11b38a6895 Add more functions to autograd 2016-09-30 16:37:07 -04:00
a1f5fe6a8f Add multiprocess data loader + improvements to torch.utils.data 2016-09-30 16:23:43 -04:00
5cad164dee Merge pull request #73 from colesbury/THC
Update THC and THCUNN
2016-09-30 15:53:11 -04:00
7dd28b885d Allow changing the default error handler for all threads
THSetErrorHandler still modifies per-thread pointers, but
THSetDefaultErrorHandler allows to set a handler that's
used by all threads that haven't specified any function.
2016-09-30 12:37:58 -07:00
c20828478e Update Module.cpp for THC changes 2016-09-30 11:13:14 -07:00
3e1c88e3e0 Merge commit 'da1e3f084d237ba319a22987f95f70abb69d7745' 2016-09-30 11:07:46 -07:00
e98a4ea336 Merge commit '0b0a62420c52b6e4d4c80c36d067db4654d1ed8d' 2016-09-30 11:06:53 -07:00
e8a5f00866 Auto GPU for CUNN (#71) 2016-09-30 14:04:53 -04:00
d92b7da733 fix documentation to not use forward 2016-09-30 09:49:30 -07:00
7ff16baa7d Use breadth-first in ExecutionEngine (#72) 2016-09-29 23:57:37 -04:00
93e60715af Fix error message 2016-09-29 16:27:20 -07:00
14965cfce9 Run cuDNN operations on the correct device 2016-09-29 16:27:07 -07:00
da1e3f084d Fixes for https://github.com/torch/cutorch/pull/519 2016-09-29 16:19:41 -07:00
0b0a62420c Make some basic THC operations thread-safe
Switching the device, setting the stream, and switching BLAS handles is
now thread-safe. Some other operations, like reserveStreams, are still
not thread-safe.
2016-09-29 16:17:43 -07:00
c92c82aa1a Really fix utils tests... 2016-09-29 12:52:12 -07:00
4742c08c7c Improve error messages in autograd 2016-09-29 12:16:19 -07:00
9c6ced1c0a Disable ffi tests if cffi is not available 2016-09-29 12:16:19 -07:00
a33c9bd774 Improve argument matching in invalidArguments 2016-09-29 12:16:19 -07:00
c8a4734b97 Add RReLU to both nn packages 2016-09-29 11:33:34 -07:00
3f7ab95890 Finish implementation of prng related functions 2016-09-29 11:33:25 -07:00
2d8c2972ae Only allow leaf variables as module parameters 2016-09-29 11:31:26 -07:00
941cf4e63d Add ffi utils for user C extensions 2016-09-29 09:35:56 -07:00
57610a7471 Fix documentation for MaxUnpool2d (#68) 2016-09-29 10:02:34 -04:00
f5a6a3b0e9 Fix torch.nn.Module._apply with None types (#66) 2016-09-28 19:31:07 -04:00
bab7f89cdc Fix no_bias constructor for conv2d (#65) 2016-09-28 19:30:43 -04:00
cb5d4e836f Lazy load CUDA and THNN modules (#64) 2016-09-28 19:29:53 -04:00
3a5544f060 Add support for GenerateFloatTypes, for use with cunn. 2016-09-28 09:59:19 -07:00
412019dbe4 fixing CPU builds by making cuda imports optional 2016-09-28 11:56:18 -04:00
f9d9c92560 Fix type conversions in autograd 2016-09-27 15:45:52 -07:00
7f4ff0e615 Fix type conversions in nn 2016-09-27 15:45:49 -07:00
3eac7164f4 Add data parallel functions to nn 2016-09-27 15:45:45 -07:00
f9d25e8e72 Refactor nn (require specifying parameters explicitly) 2016-09-27 15:22:26 -07:00
52ed57352a Free GIL in C functions 2016-09-27 15:22:20 -07:00
1828e7c42f Add async CUDA copy 2016-09-27 15:12:48 -07:00
2c89ae4e8a Rename getDevice to get_device 2016-09-27 15:12:48 -07:00
779a460030 Add cuDNN support for convolutions (#36) 2016-09-27 17:55:04 -04:00
0312f939d6 Only set c++11 compiler flags on THCCachingAllocator.cpp 2016-09-27 13:13:59 -07:00
60a8a9e918 improving error messages in nn 2016-09-27 12:26:03 -04:00
89666fc4fe Fix SpatialLogSoftMax memory leak and code cleanup 2016-09-27 08:16:31 -07:00
44527ab5be fix c++11 flags thing 2016-09-27 09:26:21 -04:00
a0cf6658c5 windows high resolution timer with a few makefile changes (#776)
windows high resolution timer
2016-09-27 08:59:27 -04:00
5107f23126 fix ClassNLLCriterion targets in tests and legacy nn 2016-09-26 18:56:12 -07:00
4a5557203b Merge commit 'c020a8502bd943aa37f897efe79a01fd61249ab4' 2016-09-26 17:54:05 -07:00
c020a8502b making ClassNLLCriterion targets consistent between cpu and cuda 2016-09-26 17:48:17 -07:00
44481354fc Add back support for child=None in Container constructor (#55)
It's often useful to have optional child modules, such as the
downsampling operation in ResNets. Add a test for this case:

  nn.Container(
    child=None,
  )
2016-09-26 17:18:02 -04:00
974fb1b09a Merge pull request #57 from colesbury/THC
Update THC and use CUDA caching allocator
2016-09-26 16:29:02 -04:00
4e9f0a8255 Use CUDA caching allocator 2016-09-26 13:12:39 -07:00
fa1f286cae Merge commit '85bd287b7ba481312fa58d7ffb32cba901c58829' 2016-09-26 13:08:32 -07:00
85bd287b7b Add THC_CACHING_ALLOCATOR=1 to README.md 2016-09-26 13:02:48 -07:00
0eff3897e3 Update SpatialLogSoftMax kernel to use cuda dimensions 2016-09-26 09:39:56 -07:00
e26e35a9ee bug fix for read/writeLong in THMemoryFile 2016-09-26 10:45:10 +08:00
980300b381 Combine autograd.Leaf and autograd.Variable (#52)
Prior to this change, there was a circular reference between Leaf and
Variable. This means that the objects (and referenced Tensors) are not
collected as soon as they go out of scope, which lead to higher memory
usage and out-of-memory errors.
2016-09-25 20:21:14 -04:00
1cf87e8a0b OSX + Python 2 build fixes 2016-09-25 19:26:13 -04:00
817d860af5 Add CUDA caching allocator
The allocator can be enabled by setting the environment variable
THC_CACHING_ALLOCATOR=1
2016-09-25 12:57:50 -07:00
0be5031a93 Pretty print type mismatches in error messages 2016-09-25 12:26:00 -07:00
1ed488da4f Make custom precision of CUDA tests work in inplace mode as well 2016-09-25 12:26:00 -07:00
ddf1598ef8 Add a method for catching exceptions thrown in ctypes 2016-09-25 12:25:54 -07:00
4a8a185aa4 Preserve storage view sharing in torch.save and torch.load 2016-09-25 12:24:10 -07:00
4cdeae3283 Return only unique variables from parameters() 2016-09-25 12:23:43 -07:00
5030d76acf Reduce precision of CUDA blas tests 2016-09-23 21:10:28 -07:00
c51e2c8b8c Rename CELoss to CrossEntropyLoss 2016-09-23 18:06:44 -07:00
eec0420eb3 Initialize nn modules' parameters with a default tensor type 2016-09-23 18:06:26 -07:00
e66ea56bb3 Improve THNN tensor type mismatch error messages 2016-09-23 18:06:26 -07:00
eefa0c7b40 Require torch.nn.cuda automatically when calling .cuda() 2016-09-23 18:06:26 -07:00
a489884da4 Reduce precision of addmm CUDA test 2016-09-23 17:52:08 -07:00
7a74d3fc9e Fix dl flag module in python>=3.6 2016-09-23 17:25:10 -07:00
e71204b52f Improve error messages in storage and tensor C functions 2016-09-23 17:17:35 -07:00
ca330b110a Add scan tests 2016-09-22 11:58:33 -07:00
6c77476cc1 Make tests check for deltas and report bandwidth 2016-09-22 11:58:28 -07:00
cabd6848e4 Heavy code refactoring to remove a lot of code in collectives (~1000 lines).
Have all collectives use the same args, the same ring, and the same primitives for synchronization between threads with the same pattern.
2016-09-22 11:57:56 -07:00
e3dbc6110e Add profiling API 2016-09-22 11:56:51 -07:00
1d6715fe20 Fix MPI test path 2016-09-22 11:56:20 -07:00
06ab3f962f Refactor _C extension to export some utilities 2016-09-21 08:36:54 -07:00
df77a8a81a Update LogSoftMax to work in spatial domain 2016-09-21 08:11:59 -07:00
94b7c32eb3 compiling double atomicAdd only if CUDA_ARCH < 6000, because it's now included in CUDA 2016-09-20 20:42:23 -04:00
8fdec15a55 Codemod to remove camel case method naming 2016-09-20 08:40:28 -07:00
e8b1217b28 Use bitwise operations for atomicAdd rather than byte_perm or pointer deferences.
Also properly check that half is enabled.
2016-09-19 14:00:52 -07:00
f56f06d88d fix cpuid ecx; change to compile with msvc 2016-09-19 14:41:48 +08:00
0f7a1e27d0 updating auto-generated docs 2016-09-19 00:39:46 -04:00
5114d94ad9 docstrings for conv, dropout, linear, pooling and sparse functions 2016-09-19 00:31:22 -04:00
f74c42bf00 Slightly improve THNN error messages 2016-09-18 15:02:25 -04:00
a8e816f450 Fix maskedSelect test 2016-09-18 12:54:12 -04:00
a90c259eda Add myself to LICENSE file 2016-09-18 12:53:57 -04:00
e223564a55 Fix multiprocessing on OS X 2016-09-16 18:27:07 -04:00
7847d77405 Add more functions to autograd 2016-09-16 15:26:24 -07:00
089d223922 Add support for CUDA indexAdd
Adds indexAdd via atomicAdd for unsigned char, char, short, long,
half, double.  Integer types are templatized based on sizeof.
Floating point types are implemented via intrinsics.
2016-09-16 12:50:57 -07:00
930085ec9c fixing doc2md for code blocks 2016-09-16 13:34:12 -04:00
e5874ea40d Add getDevice for CUDA storages 2016-09-15 13:54:39 -07:00
9ee6189bf9 Merge pull request #41 from jia-kai/master
Some minor fixes for compile/usage
2016-09-15 09:45:52 -07:00
939b0a4297 Merge pull request #45 from NVIDIA/cw-update-copyright-year
Update LICENSE.txt
2016-08-26 15:44:00 -07:00
234c8c9ef3 Update LICENSE.txt 2016-08-26 15:39:21 -07:00
75bad643bd Updated LICENCE.txt 2016-08-26 15:08:20 -07:00
47b0797fe1 pass devlist as const int* rather than int* in ncclCommInitAll 2016-08-19 19:00:14 +08:00
ed401cc29b link library with -lrt; otherwise there is undefined reference to shm_open 2016-08-19 18:58:56 +08:00
b3a9e1333d Remove unneeded deb build script 2016-07-27 17:58:00 -07:00
428ec5b2a3 Merge remote-tracking branch 'github/master' into public 2016-07-25 10:53:01 -07:00
55c42ad681 Fixed redundant contexts in multi-process apps
Change-Id: If787014450fd281304f0c7baf01d25963e40905d
2016-07-25 10:10:30 -07:00
7a1aa6b563 Improved Deb generation 2016-07-07 16:31:57 +02:00
9ae84f5d6b Fix version number 2016-06-16 17:07:42 -07:00
e51e922924 Add a debug level to NCCL and CUDA versions at init 2016-06-16 17:04:41 -07:00
9fcc523485 Increased version to 1.2.3 2016-06-15 19:18:13 -07:00
67d1ab9106 Packaging : Generate shlibs.local 2016-06-15 19:03:08 -07:00
da6d2009e0 Move deb to build directory 2016-06-15 18:20:10 -07:00
155132d336 Fix make install to use BUILDDIR 2016-06-15 18:20:02 -07:00
08ddfe03d2 Rework debian packaging 2016-06-15 18:18:44 -07:00
5d4716a8a3 Include link to blog post in README.md 2016-06-15 10:54:19 -07:00
aa8f669a3d Updating for .deb rebuild 2016-06-13 02:01:49 -07:00
d5e507fc7f Only call the CUDA runtime. That may fix #27. 2016-06-07 16:27:51 -07:00
620491a649 Merge remote-tracking branch 'github/master' into HEAD 2016-06-06 14:35:57 -07:00
7edfc57228 Make NCCL collectives work on communicators with only one rank 2016-06-06 14:35:00 -07:00
bd3cf73e6e Changed CURAND generator to work on a wider set of platforms. 2016-06-06 14:34:03 -07:00
177505b757 Gencodes changed to NV recommended 2016-06-06 00:06:18 -07:00
9d9d8cd59f Bump to 1.2.2 2016-06-03 17:21:53 -07:00
1657af1567 Better name for GENCODE 2016-06-03 10:25:37 -07:00
acb93d1aed Removing unneeded includes 2016-06-02 17:33:43 -07:00
889ad3d4e6 Makefile improvements
- Use standard CXX env var
 - Permit redefinition of more env
 - Separate lib from tests
2016-06-02 15:01:03 -07:00
93538def65 Merge pull request #22 from borisfom/master
Fixed version in ChangeLog
2016-04-21 18:58:44 -07:00
e5067b6611 Fixed version in ChangeLog 2016-04-21 16:28:13 -07:00
0629fb62d7 Merge pull request #21 from borisfom/master
Fixed install location, new .deb version
2016-04-21 14:46:41 -07:00
0177cf3ea4 Fixed install location, new .deb version 2016-04-21 14:10:31 -07:00
658aca1469 Merge pull request #17 from Hopobcn/master
Enable compilation with specific g++
2016-04-21 13:25:18 -07:00
03df4c7759 Moved no-as-needed flag to link rule.
Avoids link errors for tests linked with nvcc.
2016-04-19 14:51:03 -07:00
0d4f8f4e95 Merge pull request #18 from apaszke/master
Add --no-as-needed to make sure that cudart library gets linked
2016-04-19 11:11:39 -07:00
ddd3f2084d Fix readme to reflect the new test paths 2016-04-19 11:09:25 -07:00
dba3ec9428 Fix random deadlock during ncclCommInitRank. 2016-04-19 10:47:27 -07:00
9de361a1b9 Fix MPI test usage
Only display usage from rank 0 and exit instead of continuing (and seg fault).
2016-04-19 10:43:38 -07:00
c0c959b1be Add --no-as-needed to make sure that cudart library gets liked 2016-04-13 10:04:38 -04:00
e30bf95989 Enable compilation with old g++ when the default g++ is not supported (+5.0) 2016-04-12 12:49:13 +02:00
b16cc5d197 Merge pull request #16 from borisfom/master
Remved Tegra, fixed + format.
2016-03-17 17:35:04 -07:00
e6f4a83da6 Removing Tegra 2016-03-17 17:25:27 -07:00
1a8bae5b2f fixed version format 2016-03-17 17:13:45 -07:00
e8eb285a59 Merge pull request #15 from borisfom/master
Fixing version number and compile param for 5.3
2016-03-17 16:03:05 -07:00
b508d28123 Version with . 7.5 2016-03-17 15:48:48 -07:00
62b551798f Use arch=5.3 as well 2016-03-16 23:09:36 -07:00
dfbebe395c Delete libnccl1_1.1.1+cuda75_amd64.deb 2016-03-16 21:44:13 -07:00
85280b5bf4 Delete libnccl-dev_1.1.1+cuda75_amd64.deb 2016-03-16 21:44:04 -07:00
fb53cfd9b0 Added files via upload 2016-03-16 21:42:47 -07:00
92d2123d8d Added compute 5.3 2016-03-16 19:24:48 -07:00
ec3de28ae5 Preparing for pbuild 2016-03-16 19:23:49 -07:00
86dc136fa9 Moved to pbuilder 2016-03-16 18:41:54 -07:00
172f316ac2 Moved release files to proper area
Bumping a version; building for 7.5
2016-03-16 18:30:53 -07:00
941d9da08c Updated package version, added manpage 2016-02-29 12:10:34 -08:00
5554a4c9f0 Fixed useRemoteRecv consistency issue.
Change-Id: Ib093a8dc3bb093eddc89dad81d3fffa53c03a6a2
Reviewed-on: http://git-master/r/1013543
Reviewed-by: Cliff Woolley <jwoolley@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-02-18 13:45:42 -08:00
9442285526 Fixed buffer overflow in ReduceOrCopy
Bug caused AllGathers and ReduceScatters of less than
8 bytes to fail in certain cases.

Change-Id: I33e1beb50805bfdb457ae16a90e3f91c1b283b9b
Reviewed-on: http://git-master/r/1011505
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-02-12 15:13:56 -08:00
caa40b8dd3 Libwrap checks for LIB.so.1 if LIB.so not found
Change-Id: I6f07f887f828cb2259dcfd496a2ad707db898cf5
Reviewed-on: http://git-master/r/1000162
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-01-29 12:36:42 -08:00
2758353380 Added NCCL error checking to tests.
Also cleaned up makefile so that tests and lib are not built unnecessarily.

Change-Id: Ia0c596cc2213628de2f066be97615c09bb1bb262
Reviewed-on: http://git-master/r/999627
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-01-29 11:09:05 -08:00
fe1a956715 Enabled support for char type to be unsigned.
GCC on POWER arch defines char type as unsigned.

Change-Id: Ic143cb058fe42414b1f6f1f45b02132c837726ae
Reviewed-on: http://git-master/r/999614
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-01-28 13:38:18 -08:00
c05312f151 Moved tests to separate dir and improved MPI test
test sources moved to test/ directory.
MPI test displays PASS/FAIL and returns code accordingly.

Change-Id: I058ebd1bd5202d8f38cc9787898b2480100c102b
Reviewed-on: http://git-master/r/936086
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-01-28 12:56:36 -08:00
5966316771 Added support for more than 8 GPUs.
Change-Id: Iaa1841036a7bfdad6ebec99fed0adcd2bbe6ffad
Reviewed-on: http://git-master/r/935459
Reviewed-by: Cliff Woolley <jwoolley@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-01-21 13:00:21 -08:00
130ee246e2 Fixed deadlock in back-to-back reduce_scatters.
Change-Id: I92d32b15e516a39710b676aee692ae9b70638937
Reviewed-on: http://git-master/r/935458
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
2016-01-21 10:36:03 -08:00
90af7c73ef Merge pull request #6 from lukeyeager/deb
Deb packaging
2016-01-07 13:06:28 -08:00
3251681207 Merge branch 'yangky11-patch-1' 2016-01-06 16:48:29 -08:00
d332c41e71 fix a typo in README.md 2015-12-24 00:01:02 +08:00
c9da89254b Update deb packaging scripts 2015-12-18 14:23:34 -08:00
eb2d869f71 Merge pull request #5 from lukeyeager/tests-nvml
Don't link tests with NVML
2015-12-18 13:36:20 -08:00
f1e92fe2a3 Added Debian packaging files 2015-12-18 13:36:10 -08:00
b5400c54df Don't link tests with NVML 2015-12-18 13:27:55 -08:00
a4de6016f8 Merge pull request #4 from lukeyeager/build-sm50
Build SM 5.0 code
2015-12-18 13:23:48 -08:00
4807909e3f Merge pull request #3 from lukeyeager/semver
Use semantic versioning
2015-12-18 13:22:19 -08:00
dd0884b707 Build SM 5.0 code 2015-12-18 13:19:50 -08:00
e1634ca6cb Use semantic versioning 2015-12-18 12:02:17 -08:00
651a6edc5c Fixed bug in MPI initialization. 2015-12-10 17:54:41 -08:00
ada5edce88 Merge pull request #1 from slayton58/int64_uint64
Add int64 and uint64 types for all algorithms and tests
2015-12-10 17:22:50 -08:00
41ce4ca9fc Add int64 and uint64 types for all algorithms and tests 2015-12-04 13:28:36 -05:00
27d32ac5d9 Fixed a race condition in reduce and braodcast. 2015-11-19 11:11:52 -08:00
0673d5f44f Initial release. 2015-11-17 11:30:40 -08:00
1479 changed files with 172708 additions and 38183 deletions

33
.gitignore vendored
View File

@ -2,17 +2,50 @@ build/
dist/
torch.egg-info/
*/**/__pycache__
torch/version.py
torch/csrc/generic/TensorMethods.cpp
torch/lib/*.so*
torch/lib/*.a*
torch/lib/*.dylib*
torch/lib/*.h
torch/lib/build
torch/lib/tmp_install
torch/lib/include
torch/lib/torch_shm_manager
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/nn/THNN.cwrap
torch/csrc/nn/THNN.cpp
torch/csrc/nn/THCUNN.cwrap
torch/csrc/nn/THCUNN.cpp
torch/csrc/nn/THNN_generic.cwrap
torch/csrc/nn/THNN_generic.cpp
torch/csrc/nn/THNN_generic.h
torch/csrc/generated
docs/src/**/*
test/data/legacy_modules.t7
test/data/gpu_tensors.pt
test/htmlcov
test/.coverage
*/*.pyc
*/**/*.pyc
*/**/**/*.pyc
*/**/**/**/*.pyc
*/**/**/**/**/*.pyc
*/*.so*
*/**/*.so*
*/**/*.dylib*
test/data/legacy_serialized.pt
test/data/linear.pt
# IPython notebook checkpoints
.ipynb_checkpoints
# Editor temporaries
*.swn
*.swo
*.swp
*~
# OSX dir files
.DS_Store

View File

@ -1,21 +1,30 @@
# https://travis-ci.org/pytorch/pytorch
language: python
dist: trusty
python:
- 2.7.8
- 2.7.9
- 2.7
- 3.3
- 3.4
- 3.5
- 3.6
- nightly
cache:
- ccache
- directories:
- $HOME/.ccache
install:
- export CC="gcc-4.8"
- export CXX="g++-4.8"
- travis_retry pip install -r requirements.txt
- travis_retry pip install .
- unset CCACHE_DISABLE
- export CCACHE_DIR=$HOME/.ccache
- export CC="ccache gcc-4.8"
- export CXX="ccache g++-4.8"
- ccache --show-stats
- travis_retry pip install --upgrade pip setuptools wheel
- travis_retry pip install -r requirements.txt --only-binary=scipy
- python setup.py install
script:
- ./test/run_test.sh
- OMP_NUM_THREADS=2 ./test/run_test.sh
addons:
apt:
@ -32,3 +41,9 @@ sudo: false
matrix:
fast_finish: true
include:
env: LINT_CHECK
python: "2.7"
addons: true
install: pip install flake8
script: flake8

185
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,185 @@
## Contributing to PyTorch
If you are interested in contributing to PyTorch, your contributions will fall
into two categories:
1. You want to propose a new Feature and implement it
- post about your intended feature, and we shall discuss the design and
implementation. Once we agree that the plan looks good, go ahead and implement it.
2. You want to implement a feature or bug-fix for an outstanding issue
- Look at the outstanding issues here: https://github.com/pytorch/pytorch/issues
- Especially look at the Low Priority and Medium Priority issues
- Pick an issue and comment on it to say that you want to work on it
- If you need more context on a particular issue, please ask and we shall provide.
Once you finish implementing a feature or bugfix, please send a Pull Request to
https://github.com/pytorch/pytorch
If you are not familiar with creating a Pull Request, here are some guides:
- http://stackoverflow.com/questions/14680711/how-to-do-a-github-pull-request
- https://help.github.com/articles/creating-a-pull-request/
## Developing locally with PyTorch
To locally develop with PyTorch, here are some tips:
1. Uninstall all existing pytorch installs
```
conda uninstall pytorch
pip uninstall torch
pip uninstall torch # run this command twice
```
2. Locally clone a copy of PyTorch from source:
```
git clone https://github.com/pytorch/pytorch
cd pytorch
```
3. Install PyTorch in `build develop` mode:
A full set of instructions on installing PyTorch from Source is here:
https://github.com/pytorch/pytorch#from-source
The change you have to make is to replace
```
python setup.py install
```
with
```
python setup.py build develop
```
This is especially useful if you are only changing Python files.
This mode will symlink the python files from the current local source tree into the
python install.
Hence, if you modify a python file, you do not need to reinstall pytorch again and again.
For example:
- Install local pytorch in `build develop` mode
- modify your python file `torch/__init__.py` (for example)
- test functionality
- modify your python file `torch/__init__.py`
- test functionality
- modify your python file `torch/__init__.py`
- test functionality
You do not need to repeatedly install after modifying python files.
## Writing documentation
PyTorch uses [Google style](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html)
for formatting docstrings. Lines inside docstring blocks must be limited to 80 characters to
fit into Jupyter documentation popups.
## Managing multiple build trees
One downside to using `python setup.py develop` is that your development
version of pytorch will be installed globally on your account (e.g., if
you run `import torch` anywhere else, the development version will be
used).
If you want to manage multiple builds of PyTorch, you can make use of
[conda environments](https://conda.io/docs/using/envs.html) to maintain
separate Python package environments, each of which can be tied to a
specific build of PyTorch. To set one up:
```
conda create -n pytorch-myfeature
source activate pytorch-myfeature
# if you run python now, torch will NOT be installed
python setup.py build develop
```
## C++ Development tips
If you are working on the C++ code, there are a few important things that you
will want to keep in mind:
1. How to rebuild only the code you are working on, and
2. How to make rebuilds in the absence of changes go faster.
### Build only what you need.
`python setup.py build` will build everything, but since our build system is
not very optimized for incremental rebuilds, this will actually be very slow.
Far better is to only request rebuilds of the parts of the project you are
working on:
- Working on `torch/csrc`? Run `python setup.py develop` to rebuild
(NB: no `build` here!)
- Working on `torch/lib/TH`, did not make any cmake changes, and just want to
see if it compiles? Run `(cd torch/lib/build/TH && make install -j$(getconf _NPROCESSORS_ONLN))`. This
applies for any other subdirectory of `torch/lib`. **Warning: Changes you
make here will not be visible from Python.** See below.
- Working on `torch/lib` and want to run your changes / rerun cmake? Run
`python setup.py build_deps`. Note that this will rerun cmake for
every subdirectory in TH; if you are only working on one project,
consider editing `torch/lib/build_all.sh` and commenting out the
`build` lines of libraries you are not working on.
On the initial build, you can also speed things up with the environment
variables `DEBUG` and `NO_CUDA`.
- `DEBUG=1` will enable debug builds (-g -O0)
- `NO_CUDA=1` will disable compiling CUDA (in case you are developing on something not CUDA related), to save compile time.
For example:
```
NO_CUDA=1 DEBUG=1 python setup.py build develop
```
Make sure you continue to pass these flags on subsequent builds.
### Make no-op build fast.
Python `setuptools` is pretty dumb, and always rebuilds every C file in a
project. Using ccache in a situation like this is a real time-saver. However, by
default, ccache does not properly support CUDA stuff, so here are the
instructions for installing a custom `ccache` fork that has CUDA support:
```
# install and export ccache
if ! ls ~/ccache/bin/ccache
then
sudo apt-get update
sudo apt-get install -y automake autoconf
sudo apt-get install -y asciidoc
mkdir -p ~/ccache
pushd /tmp
rm -rf ccache
git clone https://github.com/colesbury/ccache -b ccbin
pushd ccache
./autogen.sh
./configure
make install prefix=~/ccache
popd
popd
mkdir -p ~/ccache/lib
mkdir -p ~/ccache/cuda
ln -s ~/ccache/bin/ccache ~/ccache/lib/cc
ln -s ~/ccache/bin/ccache ~/ccache/lib/c++
ln -s ~/ccache/bin/ccache ~/ccache/lib/gcc
ln -s ~/ccache/bin/ccache ~/ccache/lib/g++
ln -s ~/ccache/bin/ccache ~/ccache/cuda/nvcc
~/ccache/bin/ccache -M 25Gi
fi
export PATH=~/ccache/lib:$PATH
export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc
```
Hope this helps, and thanks for considering contributing.

36
Dockerfile Normal file
View File

@ -0,0 +1,36 @@
FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
vim \
ca-certificates \
libjpeg-dev \
libpng-dev &&\
rm -rf /var/lib/apt/lists/*
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install conda-build && \
/opt/conda/bin/conda create -y --name pytorch-py35 python=3.5.2 numpy pyyaml scipy ipython mkl&& \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/envs/pytorch-py35/bin:$PATH
RUN conda install --name pytorch-py35 -c soumith magma-cuda80
# This must be done before pip so that requirements.txt is available
WORKDIR /opt/pytorch
COPY . .
RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
pip install -v .
RUN git clone https://github.com/pytorch/vision.git && cd vision && pip install -v .
WORKDIR /workspace
RUN chmod -R a+w /workspace

View File

@ -1,3 +1,4 @@
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
@ -19,9 +20,9 @@ modification, are permitted provided that the following conditions are met:
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

550
README.md
View File

@ -1,352 +1,254 @@
# pytorch [alpha-3]
<p align="center"><img width="40%" src="docs/source/_static/img/pytorch-logo-dark.png" /></p>
| Python | **`Linux CPU`** | **`Linux GPU`** |
|--------|--------------------|------------------|
| 2.7.8 | [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | |
| 2.7 | [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py2)](https://build.pytorch.org/job/pytorch-master-py2) |
| 3.3 | [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | |
| 3.4 | [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | |
| 3.5 | [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py3)](https://build.pytorch.org/job/pytorch-master-py3) |
| Nightly| [![Build Status](https://travis-ci.com/apaszke/pytorch.svg?token=shqHbUq29zKDxuqzGcjC&branch=master)](https://travis-ci.com/apaszke/pytorch) | |
--------------------------------------------------------------------------------
The project is still under active development and is likely to drastically change in short periods of time.
We will be announcing API changes and important developments via a newsletter, github issues and post a link to the issues on slack.
Please remember that at this stage, this is an invite-only closed alpha, and please don't distribute code further.
This is done so that we can control development tightly and rapidly during the initial phases with feedback from you.
PyTorch is a Python package that provides two high-level features:
- Tensor computation (like NumPy) with strong GPU acceleration
- Deep neural networks built on a tape-based autograd system
You can reuse your favorite Python packages such as NumPy, SciPy and Cython to extend PyTorch when needed.
We are in an early-release beta. Expect some adventures and rough edges.
- [More about PyTorch](#more-about-pytorch)
- [Installation](#installation)
- [Binaries](#binaries)
- [From Source](#from-source)
- [Docker Image](#docker-image)
- [Getting Started](#getting-started)
- [Communication](#communication)
- [Releases and Contributing](#releases-and-contributing)
- [The Team](#the-team)
| System | 2.7 | 3.5 |
| --- | --- | --- |
| Linux CPU | [![Build Status](https://travis-ci.org/pytorch/pytorch.svg?branch=master)](https://travis-ci.org/pytorch/pytorch) | [![Build Status](https://travis-ci.org/pytorch/pytorch.svg?branch=master)](https://travis-ci.org/pytorch/pytorch) |
| Linux GPU | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py2-linux)](https://build.pytorch.org/job/pytorch-master-py2-linux) | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py3-linux)](https://build.pytorch.org/job/pytorch-master-py3-linux) |
| macOS CPU | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py2-osx-cpu)](https://build.pytorch.org/job/pytorch-master-py2-osx-cpu) | [![Build Status](http://build.pytorch.org:8080/buildStatus/icon?job=pytorch-master-py3-osx-cpu)](https://build.pytorch.org/job/pytorch-master-py3-osx-cpu) |
## More about PyTorch
At a granular level, PyTorch is a library that consists of the following components:
<table>
<tr>
<td><b> torch </b></td>
<td> a Tensor library like NumPy, with strong GPU support </td>
</tr>
<tr>
<td><b> torch.autograd </b></td>
<td> a tape-based automatic differentiation library that supports all differentiable Tensor operations in torch </td>
</tr>
<tr>
<td><b> torch.nn </b></td>
<td> a neural networks library deeply integrated with autograd designed for maximum flexibility </td>
</tr>
<tr>
<td><b> torch.multiprocessing </b></td>
<td> Python multiprocessing, but with magical memory sharing of torch Tensors across processes. Useful for data loading and Hogwild training. </td>
</tr>
<tr>
<td><b> torch.utils </b></td>
<td> DataLoader, Trainer and other utility functions for convenience </td>
</tr>
<tr>
<td><b> torch.legacy(.nn/.optim) </b></td>
<td> legacy code that has been ported over from torch for backward compatibility reasons </td>
</tr>
</table>
Usually one uses PyTorch either as:
- a replacement for NumPy to use the power of GPUs.
- a deep learning research platform that provides maximum flexibility and speed
Elaborating further:
### A GPU-Ready Tensor Library
If you use NumPy, then you have used Tensors (a.k.a ndarray).
<p align=center><img width="30%" src="docs/source/_static/img/tensor_illustration.png" /></p>
PyTorch provides Tensors that can live either on the CPU or the GPU, and accelerate
compute by a huge amount.
We provide a wide variety of tensor routines to accelerate and fit your scientific computation needs
such as slicing, indexing, math operations, linear algebra, reductions.
And they are fast!
### Dynamic Neural Networks: Tape-Based Autograd
PyTorch has a unique way of building neural networks: using and replaying a tape recorder.
Most frameworks such as TensorFlow, Theano, Caffe and CNTK have a static view of the world.
One has to build a neural network, and reuse the same structure again and again.
Changing the way the network behaves means that one has to start from scratch.
With PyTorch, we use a technique called reverse-mode auto-differentiation, which allows you to
change the way your network behaves arbitrarily with zero lag or overhead. Our inspiration comes
from several research papers on this topic, as well as current and past work such as
[autograd](https://github.com/twitter/torch-autograd),
[autograd](https://github.com/HIPS/autograd),
[Chainer](http://chainer.org), etc.
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
You get the best of speed and flexibility for your crazy research.
<p align=center><img width="80%" src="docs/source/_static/img/dynamic_graph.gif" /></p>
### Python First
PyTorch is not a Python binding into a monolithic C++ framework.
It is built to be deeply integrated into Python.
You can use it naturally like you would use NumPy / SciPy / scikit-learn etc.
You can write your new neural network layers in Python itself, using your favorite libraries
and use packages such as Cython and Numba.
Our goal is to not reinvent the wheel where appropriate.
### Imperative Experiences
PyTorch is designed to be intuitive, linear in thought and easy to use.
When you execute a line of code, it gets executed. There isn't an asynchronous view of the world.
When you drop into a debugger, or receive error messages and stack traces, understanding them is straightforward.
The stack trace points to exactly where your code was defined.
We hope you never spend hours debugging your code because of bad stack traces or asynchronous and opaque execution engines.
### Fast and Lean
PyTorch has minimal framework overhead. We integrate acceleration libraries
such as Intel MKL and NVIDIA (cuDNN, NCCL) to maximize speed.
At the core, its CPU and GPU Tensor and neural network backends
(TH, THC, THNN, THCUNN) are written as independent libraries with a C99 API.
They are mature and have been tested for years.
Hence, PyTorch is quite fast whether you run small or large neural networks.
The memory usage in PyTorch is extremely efficient compared to Torch or some of the alternatives.
We've written custom memory allocators for the GPU to make sure that
your deep learning models are maximally memory efficient.
This enables you to train bigger deep learning models than before.
### Extensions without Pain
Writing new neural network modules, or interfacing with PyTorch's Tensor API was designed to be straightforward
and with minimal abstractions.
You can write new neural network layers in Python using the torch API
[or your favorite NumPy-based libraries such as SciPy](http://pytorch.org/tutorials/advanced/numpy_extensions_tutorial.html).
If you want to write your layers in C/C++, we provide an extension API based on
[cffi](http://cffi.readthedocs.io/en/latest/) that is efficient and with minimal boilerplate.
There is no wrapper code that needs to be written. You can see [a tutorial here](http://pytorch.org/tutorials/advanced/c_extension.html) and [an example here](https://github.com/pytorch/extension-ffi).
## Installation
### Binaries
- Anaconda
Commands to install from binaries via Conda or pip wheels are on our website:
[http://pytorch.org](http://pytorch.org)
### From Source
If you are installing from source, we highly recommend installing an [Anaconda](https://www.continuum.io/downloads) environment.
You will get a high-quality BLAS library (MKL) and you get a controlled compiler version regardless of your Linux distro.
Once you have [Anaconda](https://www.continuum.io/downloads) installed, here are the instructions.
If you want to compile with CUDA support, install
- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 7.5 or above
- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v5.x or above
If you want to disable CUDA support, export environment variable `NO_CUDA=1`.
#### Install optional dependencies
On Linux
```bash
conda install pytorch -c https://conda.anaconda.org/t/6N-MsQ4WZ7jo/soumith
export CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" # [anaconda root directory]
# Install basic dependencies
conda install numpy pyyaml mkl setuptools cmake gcc cffi
# Add LAPACK support for the GPU
conda install -c soumith magma-cuda80 # or magma-cuda75 if CUDA 7.5
```
### From source
On OSX
```bash
pip install -r requirements.txt
pip install .
export CMAKE_PREFIX_PATH=[anaconda root directory]
conda install numpy pyyaml setuptools cmake cffi
```
#### Install PyTorch
On Linux
```bash
python setup.py install
```
On OSX
```bash
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
```
### Docker image
Dockerfile is supplied to build images with cuda support and cudnn v6. Build as usual
```
docker build -t pytorch .
```
Alternatively, if you want a runtime image, build with
```
docker build -t pytorch . -f tools/docker/Dockerfile_runtime
```
and run with nvidia-docker:
```
nvidia-docker run --rm -ti --ipc=host pytorch
```
Please note that PyTorch uses shared memory to share data between processes, so if torch multiprocessing is used (e.g.
for multithreaded data loaders), the default shared memory segment size that the container runs with is not enough, and you
should increase the shared memory size with either the `--ipc=host` or `--shm-size` command line option to `nvidia-docker run`.
## Getting Started
A more comprehensive Getting Started section will be filled in soon.
For now, there are two pointers:
- The MNIST example: [https://github.com/pytorch/examples](https://github.com/pytorch/examples)
- The API Reference: [http://pytorch.org/api/](http://pytorch.org/api/)
Three pointers to get you started:
- [Tutorials: get you started with understanding and using PyTorch](http://pytorch.org/tutorials/)
- [Examples: easy to understand pytorch code across all domains](https://github.com/pytorch/examples)
- The API Reference: [http://pytorch.org/docs/](http://pytorch.org/docs/)
## Communication
* github issues: bug reports, feature requests, install issues, RFCs, thoughts, etc.
* slack: general chat, online discussions, collaboration etc. https://pytorch.slack.com/ . If you need a slack invite, ping me at soumith@pytorch.org
* forums: discuss implementations, research, etc. http://discuss.pytorch.org
* GitHub issues: bug reports, feature requests, install issues, RFCs, thoughts, etc.
* Slack: general chat, online discussions, collaboration etc. https://pytorch.slack.com/ . If you need a slack invite, ping us at soumith@pytorch.org
* newsletter: no-noise, one-way email newsletter with important announcements about pytorch. You can sign-up here: http://eepurl.com/cbG0rv
## Timeline
## Releases and Contributing
We will run the alpha releases weekly for 6 weeks.
After that, we will reevaluate progress, and if we are ready, we will hit beta-0. If not, we will do another two weeks of alpha.
PyTorch has a 90 day release cycle (major releases).
Its current state is Beta; we expect no obvious bugs. Please let us know if you encounter a bug by [filing an issue](https://github.com/pytorch/pytorch/issues).
* ~~alpha-0: Working versions of torch, cutorch, nn, cunn, optim fully unit tested with seamless numpy conversions~~
* ~~alpha-1: Serialization to/from disk with sharing intact. initial release of the new neuralnets package based on a Chainer-like design~~
* ~~alpha-2: sharing tensors across processes for hogwild training or data-loading processes. a rewritten optim package for this new nn.~~
* ~~alpha-3: binary installs, contbuilds, etc.~~
* alpha-4: a ton of examples across vision, nlp, speech, RL -- this phase might make us rethink parts of the APIs, and hence want to do this in alpha than beta
* alpha-5: Putting a simple and efficient story around multi-machine training. Probably simplistic like torch-distlearn. Building the website, release scripts, more documentation, etc.
* alpha-6: [no plan yet]
We appreciate all contributions. If you are planning to contribute back bug-fixes, please do so without any further discussion.
The beta phases will be leaning more towards working with all of you, covering your use-cases, active development on non-core aspects.
If you plan to contribute new features, utility functions or extensions to the core, please first open an issue and discuss the feature with us.
Sending a PR without discussion might end up resulting in a rejected PR, because we might be taking the core in a different direction than you might be aware of.
## pytorch vs torch: important changes
**For the next release cycle, these are the 3 big features we are planning to add:**
We've decided that it's time to rewrite/update parts of the old torch API, even if it means losing some of backward compatibility (we can hack up a model converter that converts correctly).
This section lists the biggest changes, and suggests how to shift from torch to pytorch.
1. [Distributed PyTorch](https://github.com/pytorch/pytorch/issues/241) (a draft implementation is present in this [branch](https://github.com/apaszke/pytorch-dist) )
2. Backward of Backward - Backpropagating through the optimization process itself. Some past and recent papers such as
[Double Backprop](http://yann.lecun.com/exdb/publis/pdf/drucker-lecun-91.pdf) and [Unrolled GANs](https://arxiv.org/abs/1611.02163) need this.
3. Lazy Execution Engine for autograd - This will enable us to optionally introduce caching and JIT compilers to optimize autograd code.
For now there's no pytorch documentation.
Since all currently implemented modules are very similar to the old ones, it's best to use torch7 docs for now (having in mind several differences described below).
### Library structure
## The Team
All core modules are merged into a single repository.
Most of them will be rewritten and will be completely new (more on this below), but we're providing a Python version of old packages under torch.legacy namespace.
* torch (torch)
* cutorch (torch.cuda)
* nn (torch.legacy.nn)
* cunn (torch.legacy.cunn)
* optim (torch.legacy.optim)
* nngraph (torch.legacy.nngraph - not implemented yet)
PyTorch is a community driven project with several skillful engineers and researchers contributing to it.
### 0-based indexing
pytorch uses 0-based indexing everywhere.
This includes arguments to `index*` functions and nn criterion weights.
Under the hood, on the C side, we've changed logic on TH / THC / THNN / THCUNN to introduce a TH_INDEX_BASE compile-time definition to switch between 0 and 1 indexing logic.
### New Tensor API
**All methods operating on tensors are now out-of-place by default.**
This means that although `a.add(b)` used to have a side-effect of mutating the elements in a, it will now return a new Tensor, holding the result.
All methods that mutate the Tensor/Storage are now marked with a trailing underscore (including `copy` -> `copy_`, `fill` -> `fill_`, `set` -> `set_`, etc.).
Most of math methods have their in-place counterparts, so an equivalent to `a.add(b)` in Lua is now `a.add_(b)` (or `torch.add(a, a, b)`, which is not recommended in this case)
### CUDA module
All tensors have their CUDA counterparts in torch.cuda module.
There is no `torch.cuda.setDevice` anymore. By default always the 0th device is selected, but code can be placed in a `with` statement to change it:
```python
with torch.cuda.device(1):
a = torch.cuda.FloatTensor(10) # a is allocated on GPU1
```
Calling `.cuda()` on tensors no longer converts it to a GPU float tensor, but to a CUDA tensor of the same type located on a currently selected device.
So, for example: `a = torch.LongTensor(10).cuda() # a is a CudaLongTensor`
Calling `.cuda(3)` will send it to the third device.
`.cuda()` can be also used to transfer CUDA tensors between devices (calling it on a GPU tensor, with a different device selected will copy it into the current device).
```python
a = torch.LongTensor(10)
b = a.cuda() # b is a torch.cuda.LongTensor placed on GPU0
c = a.cuda(2) # c is a torch.cuda.LongTensor placed on GPU2
with torch.cuda.device(1):
d = b.cuda() # d is a copy of b, but on GPU1
e = d.cuda() # a no-op, d is already on current GPU, e is d == True
```
Also, setting device is now only important to specify where to allocate new Tensors. You can perform operations on CUDA Tensors irrespective of currently selected device (but all arguments have to be on the same device) - result will be also allocated there. See below for an example:
```python
a = torch.randn(2, 2).cuda()
b = torch.randn(2, 2).cuda()
with torch.cuda.device(1):
c = a + b # c is on GPU0
d = torch.randn(2, 2).cuda() # d is on GPU1
```
In the near future, we also plan to use a CUDA allocator, which allows to alleviate problems with cudaMalloc/cudaFree being a sync point.
This will help us to not worry about using buffers for every intermediate computation in a module if one wants to do multi-GPU training, for example.
See: https://github.com/torch/cutorch/pull/443
### Numpy integration
Because numpy is a core numerical package in Python, and is used by many other libraries like matplotlib, we've implemented a two-way bridge between pytorch and numpy.
```python
a = torch.randn(2, 2)
b = a.numpy() # b is a numpy array of type corresponding to a
# no memory copy is performed, they share the same storage
c = numpy.zeros(5, 5)
d = torch.DoubleTensor(c) # it's possible to construct Tensors from numpy arrays
# d shares memory with c - there's no copy
```
### New neural network module
After looking at several framework designs, looking at the current design of `nn` and thinking through a few original design ideas, this is what we've converged to:
* Adopt a Chainer-like design
* Makes it extremely natural to express Recurrent Nets and weight sharing
* Each module can operate in-place, but marks used variables as dirty - errors will be raised if they're used again
* RNN example:
```python
class Network(nn.Container):
def __init__(self):
super(Network, self).__init__(
conv1=nn.SpatialConvolution(3, 16, 3, 3, 1, 1),
relu1=nn.ReLU(True),
lstm=nn.LSTM(),
)
def __call__(self, input):
y = self.conv(input)
y = self.relu1(y)
y = self.lstm(y)
return y
model = Network()
input = nn.Variable(torch.zeros(256, 3, 224, 224))
output = model(input)
loss = 0
for i in range(ITERS):
input, target = ...
# That's all you need for an RNN
for t in range(TIMESTEPS):
loss += loss_fn(model(input), target)
loss.backward()
```
* Here, nn.Variable will have a complete tape-based automatic differentiation implemented
* To access states, have hooks for forward / backward (this also makes multi-GPU easier to implement)
* This has the advantage of not having to worry about in-place / out-of-place operators for accessing .output or .gradInput
* When writing the module, make sure debuggability is straight forward. Dropping into pdb and inspecting things should be natural, especially when going over the backward graph.
* Pulling handles to a module after constructing a chain should be very natural (apart from having a handle at construction)
* It's easy, since modules are assigned as Container properties
* Drop overly verbose names. Example:
* SpatialConvolution → conv2d
* VolumetricConvolution → conv3d
#### Some notes on new nn implementation
As shown above, structure of the networks is fully defined by control-flow embedded in the code. There are no rigid containers known from Lua. You can put an `if` in the middle of your model and freely branch depending on any condition you can come up with. All operations are registered in the computational graph history.
There are two main objects that make this possible - variables and functions. They will be denoted as squares and circles respectively.
![Variable and function symbols](http://students.mimuw.edu.pl/~ap360585/__torch_img/variable_function.png)
Variables are the objects that hold a reference to a tensor (and optionally to gradient w.r.t. that tensor), and to the function in the computational graph that created it. Variables created explicitly by the user (`Variable(tensor)`) have a Leaf function node associated with them.
![Variable and leaf function](http://students.mimuw.edu.pl/~ap360585/__torch_img/variable_leaf.png)
Functions are simple classes that define a function from a tuple of inputs to a tuple of outputs, and a formula for computing the gradient w.r.t. its inputs. Function objects are instantiated to hold references to other functions, and these references make it possible to reconstruct the history of a computation. An example graph for a linear layer (`Wx + b`) is shown below.
![Linear layer](http://students.mimuw.edu.pl/~ap360585/__torch_img/linear.png)
Please note that function objects never hold references to Variable objects, except for when they're necessary in the backward pass. This allows to free all the unnecessary intermediate values. A good example for this is addition when computing e.g. (`y = Wx + My`):
![Freeing intermediate values](http://students.mimuw.edu.pl/~ap360585/__torch_img/intermediate_free.png)
The matrix multiplication operation keeps references to its inputs because it will need them, but addition doesn't need `Wx` and `My` after it computes the result, so as soon as they go out of scope they are freed. To access intermediate values in the forward pass you can either copy them when you still have a reference, or you can use a system of hooks that can be attached to any function. Hooks also make it possible to access and inspect gradients inside the graph.
Another nice thing about this is that a single layer doesn't hold any state other than its parameters (all intermediate values are alive as long as the graph references them), so it can be used multiple times before calling backward. This is especially convenient when training RNNs. You can use the same network for all timesteps and the gradients will sum up automatically.
To compute backward pass you can call `.backward()` on a variable if it's a scalar (a 1-element Variable), or you can provide a gradient tensor of matching shape if it's not. This creates an execution engine object that manages the whole backward pass. It's been introduced, so that the code for analyzing the graph and scheduling node processing order is decoupled from other parts, and can be easily replaced. Right now it's simply processing the nodes in topological order, without any prioritization, but in the future we can implement algorithms and heuristics for scheduling independent nodes on different GPU streams, deciding which branches to compute first, etc.
### Serialization
Pickling tensors is supported, but requires making a temporary copy of all data and breaks sharing.
For this reason we're providing `torch.load` and `torch.save`, that are free of these problems.
They have the same interfaces as `pickle.load` (file object) and `pickle.dump` (serialized object, file object) respectively.
For now the only requirement is that the file should have a `fileno` method, which returns a file descriptor number (this is already implemented by objects returned by `open`).
Objects are serialized in a tar archive consisting of four files:
`sys_info` - protocol version, byte order, long size, etc.
`pickle` - pickled object
`tensors` - tensor metadata
`storages` - serialized data
### Multi-GPU
Proposed solutions need to address:
* Kernel launch latency
* without affecting the user's code
* Implementation should be as transparent as possible
* Should we expose DPT as:
* Split
* ParallelApply (scheduling kernels in breadth first order, to address launch latency)
* Join
* In backward phase, send parameters as soon as the module finishes computation
**Rough solution:**
```python
# This is an example of a network that has a data parallel part inside
#
# B is data parallel
# +->A+-->B+-+
# +--+ +->D
# +->C+------+
class Network(nn.Container):
__init__(self):
super(Network, self).__init__(
A = ...,
B = GPUReplicate(B, [0, 1, 2, 3]), # Copies the module onto a list of GPUs
C = ...,
D = ...
)
__call__(self, x):
a = self.A(x)
c = self.C(x)
a_split = Split(a) # a_split is a list of Tensors placed on different devices
b = ParallelApply(self.B, a_split) # self.B is a list-like object containing copies of B
d_input = Join(b + [c]) # gathers Tensors on a single GPU
return self.D(d_input)
```
Each module is assigned to a single GPU.
For Kernel Launch Latency:
* Python threading
* Generators
For parameter reductions ASAP:
* In the forward pass, register a hook on every parameter, which is evaluated as soon as the last backward is executed for that parameter. The hook will then “all-reduce” those parameters across GPUs
* Problem with multiple forward calls - how do you know that the parameters won't be used anymore?
* Well, last usage in backward graph = first usage in forward graph, so this should be straightforward
### Multiprocessing with Tensor sharing
In Torch, or in general, one uses "threads" to build parallel data loaders, as well as to do Hogwild training.
Threads are powerful, as one can share Tensors between threads.
This allows you to:
* transfer data between threads efficiently, with zero memory copy and serialization overhead.
* share tensors among threads for parameter sharing models
Sharing Tensors among threads is very useful when you do Hogwild training, i.e. if you want to train several models in parallel, but want to share their underlying parameters.
This is often used in non ConvNets, like training word embeddings, RL-for-games, etc.
With Python, one cannot use threads because of a few technical issues.
Python has what is called [Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock), which does not allow threads to concurrently execute python code.
Hence, the most pythonic way to use multiple CPU cores is [multiprocessing](http://docs.python.org/2/library/multiprocessing.html)
We made PyTorch to seamlessly integrate with python multiprocessing.
This involved solving some complex technical problems to make this an air-tight solution, and more can be read [in this in-depth technical discussion](http://github.com/pytorch/pytorch/wiki/Multiprocessing-Technical-Notes).
What this means for you as the end-user is that you can simply use multiprocessing in this way:
```python
# loaders.py
# Functions from this file run in the workers
def fill(queue):
while True:
tensor = queue.get()
tensor.fill_(10)
queue.put(tensor)
def fill_pool(tensor):
tensor.fill_(10)
```
```python
# Example 1: Using multiple persistent processes and a Queue
# process.py
import torch
import torch.multiprocessing as multiprocessing
from loaders import fill
# torch.multiprocessing.Queue automatically moves Tensor data to shared memory
# So the main process and worker share the data
queue = multiprocessing.Queue()
buffers = [torch.Tensor(2, 2) for i in range(4)]
for b in buffers:
queue.put(b)
processes = [multiprocessing.Process(target=fill, args=(queue,)).start() for i in range(10)]
```
```python
# Example 2: Using a process pool
# pool.py
import torch
from torch.multiprocessing import Pool
from loaders import fill_pool
tensors = [torch.Tensor(2, 2) for i in range(100)]
pool = Pool(10)
pool.map(fill_pool, tensors)
```
PyTorch is currently maintained by [Adam Paszke](https://apaszke.github.io/), [Sam Gross](https://github.com/colesbury) and [Soumith Chintala](http://soumith.ch) with major contributions coming from 10s of talented individuals in various forms and means. A non-exhaustive but growing list needs to mention: Sergey Zagoruyko, Adam Lerer, Francisco Massa, Andreas Kopf, James Bradbury, Zeming Lin, Yuandong Tian, Guillaume Lample, Marat Dukhan, Natalia Gimelshein.
Note: this project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. Hugh is a valuable contributor in the Torch community and has helped with many things Torch and PyTorch.

View File

@ -685,17 +685,21 @@ endif()
# CUDA_NVCC_EXECUTABLE
cuda_find_host_program(CUDA_NVCC_EXECUTABLE
NAMES nvcc
PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
ENV CUDA_PATH
ENV CUDA_BIN_PATH
PATH_SUFFIXES bin bin64
NO_DEFAULT_PATH
)
# Search default search paths, after we search our own set of paths.
cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
mark_as_advanced(CUDA_NVCC_EXECUTABLE)
if(DEFINED ENV{CUDA_NVCC_EXECUTABLE})
SET(CUDA_NVCC_EXECUTABLE "$ENV{CUDA_NVCC_EXECUTABLE}")
else(DEFINED ENV{CUDA_NVCC_EXECUTABLE})
cuda_find_host_program(CUDA_NVCC_EXECUTABLE
NAMES nvcc
PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
ENV CUDA_PATH
ENV CUDA_BIN_PATH
PATH_SUFFIXES bin bin64
NO_DEFAULT_PATH
)
# Search default search paths, after we search our own set of paths.
cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
mark_as_advanced(CUDA_NVCC_EXECUTABLE)
endif(DEFINED ENV{CUDA_NVCC_EXECUTABLE})
if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
# Compute the version.

View File

@ -63,11 +63,16 @@ function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
"-ccbin" ${CMAKE_CXX_COMPILER}
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_gpus tool" FORCE)
endif()
@ -116,13 +121,13 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(add_ptx TRUE)
set(arch_name ${CMAKE_MATCH_1})
endif()
if(arch_name MATCHES "([0-9]\\.[0-9])$")
if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
set(arch_bin ${CMAKE_MATCH_1})
set(arch_ptx ${arch_bin})
else()
# Look for it in our list of known architectures
if(${arch_name} STREQUAL "Fermi")
set(arch_bin 2.0 "2.1(2.0)")
set(arch_bin "2.0 2.1(2.0)")
elseif(${arch_name} STREQUAL "Kepler+Tegra")
set(arch_bin 3.2)
elseif(${arch_name} STREQUAL "Kepler+Tesla")
@ -173,11 +178,11 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
# User explicitly specified ARCH for the concrete CODE
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
# User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()

27
docs/Makefile Normal file
View File

@ -0,0 +1,27 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
# SPHINXOPTS is passed through to every sphinx-build invocation below.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
# SPHINXPROJ names the generated docset (see the "docset" target).
SPHINXPROJ = PyTorch
# SOURCEDIR and BUILDDIR are the input and output directories handed to
# sphinx-build.
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Build the HTML docs first (via the catch-all rule), then package
# $(BUILDDIR)/html/ into $(SPHINXPROJ).docset with doc2dash.
docset: html
	doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url http://pytorch.org/docs/ --force $(BUILDDIR)/html/
# Manually fix because Zeal doesn't deal well with `icon.png`-only at 2x resolution.
# The generated icon is kept as icon@2x.png and a 16x16 copy replaces icon.png
# (`convert` is presumably ImageMagick's -- confirm it is installed).
	cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png
	convert $(SPHINXPROJ).docset/icon@2x.png -resize 16x16 $(SPHINXPROJ).docset/icon.png
.PHONY: help Makefile docset
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

View File

@ -1,435 +0,0 @@
#! /usr/bin/env python
# encoding: utf-8
"""
Very lightweight docstring to Markdown converter. Modified for use in pytorch
### License
Copyright © 2013 Thomas Gläßle <t_glaessle@gmx.de>
This work is free. You can redistribute it and/or modify it under the
terms of the Do What The Fuck You Want To Public License, Version 2, as
published by Sam Hocevar. See the COPYING file for more details.
This program is free software. It comes without any warranty, to the
extent permitted by applicable law.
### Description
Little convenience tool to extract docstrings from a module or class and
convert them to GitHub Flavoured Markdown:
https://help.github.com/articles/github-flavored-markdown
Its purpose is to quickly generate `README.md` files for small projects.
### API
The interface consists of the following functions:
- `doctrim(docstring)`
- `doc2md(docstring, title)`
You can run this script from the command line like:
$ doc2md.py [-a] [--no-toc] [-t title] module-name [class-name] > README.md
### Limitations
At the moment this is suited only for a very specific use case. It is
hardly foreseeable whether I will decide to improve on it in the near future.
"""
import re
import sys
import inspect
__all__ = ['doctrim', 'doc2md']
doctrim = inspect.cleandoc
def unindent(lines):
    """
    Strip the indentation shared by all non-empty lines.

    Every line is shortened by the same number of leading characters: the
    smallest leading-whitespace width found among the non-empty lines.
    Unlike doctrim, the first line receives no special treatment.  When no
    line is non-empty the input is handed back untouched.
    """
    widths = [len(text) - len(text.lstrip()) for text in lines if text]
    if not widths:
        # Nothing to measure against: return the caller's object as is.
        return lines
    margin = min(widths)
    return [text[margin:] for text in lines]
def escape_markdown(line):
    """
    Backslash-escape every markdown control character in a line.

    The escaped characters are: backslash, backtick, ``*``, ``_``, ``#``,
    ``+``, ``-``, ``.``, ``!`` and the three bracket pairs ``[]``, ``()``,
    ``{}``.  Returns the escaped string.
    """
    # The backslash itself has to be escaped first; otherwise the
    # backslashes inserted for the other characters would be doubled as
    # well and the escapes would no longer apply to those characters.
    line = line.replace('\\', '\\\\')
    for char in '[](){}`*_#+-.!':
        line = line.replace(char, '\\' + char)
    return line
def code_block(lines, language=''):
    """
    Wrap a list of lines in a fenced markdown code block.

    The opening fence carries the language tag so that the segment gets
    syntax highlighting; a new list is returned.
    """
    opening = '```' + language
    closing = '```'
    return [opening] + lines + [closing]
def doctest2md(lines):
    """
    Turn doctest lines into plain code lines where possible.

    The lines are unindented first.  If every line is a doctest prompt line
    (starts with '>>> ' or '... ', or is a bare '>>>' / '...'), the first
    four characters of each line are dropped; otherwise the unindented
    lines are returned unchanged.
    """
    lines = unindent(lines)

    def is_prompt(text):
        return text.startswith(('>>> ', '... ')) or text in ('>>>', '...')

    if all(is_prompt(text) for text in lines):
        return [text[4:] for text in lines]
    return lines
def doc_code_block(lines, language):
    """
    Fence the given lines as a code block of the given language.

    Python segments are run through doctest2md first so that doctest
    prompts are removed when the segment consists of prompts only.
    """
    body = doctest2md(lines) if language == 'python' else lines
    return code_block(body, language)
# Matches a line that opens the "Args:" section of a docstring.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
_args_section = re.compile(r'^\s*Args:\s*')


def is_args_check(line):
    """
    Return a match object if the line opens an "Args:" section, else None.
    """
    return _args_section.match(line)
def args_block(lines):
    """
    Render "name: description Default: value" lines as a markdown table.

    Returns a list of markdown lines: an empty line, the table header, the
    separator row and one row per argument line.  The default column is
    left empty for lines without a trailing "Default: ..." part.

    Raises ValueError for a line that has no "name:" prefix.
    """
    out = [
        '',
        'Parameter | Default | Description',
        '--------- | ------- | -----------',
    ]
    for line in lines:
        match = re.search(
            r'\s*([^:]+):\s*(.*?)\s*(Default:\s(.*))?\s*$', line)
        # An explicit check replaces the former assert, which compared the
        # findall() result (always a list) against None and so never fired.
        if match is None:
            raise ValueError(
                'Cannot parse argument description: %r' % (line,))
        # groups(default='') mirrors findall(), which reports groups that
        # did not take part in the match as empty strings.
        name, description, _, default = match.groups(default='')
        out.append(name + ' | ' + default + ' | ' + description)
    return out
# Matches a line that opens the "Returns:" section of a docstring.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
_returns_section = re.compile(r'^\s*Returns:\s*')


def is_returns_check(line):
    """
    Return a match object if the line opens a "Returns:" section, else None.
    """
    return _returns_section.match(line)
# Matches an "Image: <filename>" line of a docstring.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
_image_section = re.compile(r'^\s*Image:\s*')


def is_image_check(line):
    """
    Return a match object if the line is an "Image:" line, else None.
    """
    return _image_section.match(line)
# Matches a line that opens the "Examples:" section of a docstring.
# The pattern used to carry a copy-pasted "Returns:" alternative as well,
# which made this matcher accept "Returns:" lines.
_example_section = re.compile(r'^\s*Examples:\s*')


def is_example_check(line):
    """
    Return a match object if the line opens an "Examples:" section, else None.
    """
    return _example_section.match(line)
# Matches an "Input Shape: <shape> : <description>" line of a docstring.
# The pattern used to carry a copy-pasted "Returns:" alternative as well,
# which made this matcher accept "Returns:" lines.
_inputshape_section = re.compile(r'^\s*Input Shape:\s*')


def is_inputshape_check(line):
    """
    Return a match object if the line is an "Input Shape:" line, else None.
    """
    return _inputshape_section.match(line)
# Matches an "Output Shape: <shape> : <description>" line of a docstring.
# The pattern used to carry a copy-pasted "Returns:" alternative as well,
# which made this matcher accept "Returns:" lines.
_outputshape_section = re.compile(r'^\s*Output Shape:\s*')


def is_outputshape_check(line):
    """
    Return a match object if the line is an "Output Shape:" line, else None.
    """
    return _outputshape_section.match(line)
#def get_docargs(line)
# One or more '#' characters followed by a space, at the start of the line.
_reg_section = re.compile('^#+ ')


def is_heading(line):
    """
    Return a match object if the line is a markdown heading, else None.
    """
    found = _reg_section.match(line)
    return found
def get_heading(line):
    """
    Split a markdown heading line into a (level, title) tuple.

    The level is the number of characters before the first space; the
    title is everything after it.  The line must be a heading.
    """
    assert is_heading(line)
    hashes, _, title = line.partition(' ')
    return len(hashes), title
def make_heading(level, title):
    """
    Build a markdown heading line; levels below 1 are rendered as level 1.
    """
    hashes = '#' * max(level, 1)
    return hashes + ' ' + title
def find_sections(lines):
    """
    Collect the headings found in the given lines.

    Returns a list with one (level, title) tuple per heading line, in the
    order in which the headings appear.
    """
    return [get_heading(text) for text in lines if is_heading(text)]
def make_toc(sections):
    """
    Build the table-of-contents lines for a list of (level, title) tuples.

    Each entry becomes a markdown bullet linking to an anchor derived from
    the title (lower-cased, spaces turned into dashes, question marks
    removed).  Entries are indented by one space per level below the
    shallowest section.  An empty input yields an empty list.
    """
    if not sections:
        return []
    top = min(depth for depth, _ in sections)
    entries = []
    for depth, heading in sections:
        anchor = heading.lower().replace(' ', '-').replace('?', '')
        bullet = "- [%s](#%s)" % (heading, anchor)
        entries.append(" " * (depth - top) + bullet)
    return entries
def _doc2md(lines, shiftlevel=0):
    """
    Convert the body lines of a docstring into a list of markdown lines.

    The lines are scanned once.  Special lines ("Args:", "Returns:",
    "Examples:", "Input Shape:", "Output Shape:", "Image:", doctest
    prompts, ``` fences and -- when shiftlevel is non-zero -- headings)
    open or close a section; all other lines are either collected into the
    section that is currently open or copied to the output.

    shiftlevel is added to the level of every heading that is re-emitted,
    and also determines the level of the generated sub-headings.

    Note that the returned list consists of all collected code blocks
    followed by all other markdown lines, because the code blocks are
    accumulated separately and the markdown is appended to them at the end.
    """
    # The parser state is kept in attributes of the function object itself
    # so that the nested reset() helper can rebind it.  Every call starts
    # by re-initialising all of it.
    _doc2md.md = []
    _doc2md.is_code = False
    _doc2md.is_code_block = False
    _doc2md.is_args = False
    _doc2md.is_returns = False
    _doc2md.is_inputshape = False
    _doc2md.is_outputshape = False
    _doc2md.code = []

    def reset():
        # Close whichever section is currently open and emit what it has
        # collected.  `code`, `code_block`, `args` and `returns` are locals
        # of the enclosing function; each of them is assigned right after
        # the matching flag is set, so a flag being True implies that the
        # corresponding list exists.
        if _doc2md.is_code:
            _doc2md.is_code = False
            _doc2md.code += doc_code_block(code, 'python')
            _doc2md.code += ['']
        if _doc2md.is_code_block:
            _doc2md.is_code_block = False
            # Here `code_block` is the local list of collected lines, which
            # shadows the module-level code_block() function in this scope.
            _doc2md.code += doc_code_block(code_block, 'python')
            _doc2md.code += ['']
        if _doc2md.is_args:
            _doc2md.is_args = False
            _doc2md.md += args_block(args)
        if _doc2md.is_returns:
            _doc2md.is_returns = False
            _doc2md.md += returns
        # These two flags are never read anywhere in this function.
        _doc2md.is_inputshape = False
        _doc2md.is_outputshape = False

    for line in lines:
        trimmed = line.lstrip()
        # The order of the checks below matters: the section markers are
        # tested before the "currently inside a section" branches, so a
        # marker line always ends the section that is open.
        if is_args_check(line):
            reset()
            _doc2md.is_args = True
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Constructor Arguments']
            args = []
        elif is_returns_check(line):
            reset()
            _doc2md.is_returns = True
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Returns']
            returns = []
        elif is_example_check(line):
            # The marker line itself is dropped; nothing is emitted for it.
            reset()
        elif is_inputshape_check(line):
            reset()
            # Only remembered here; the table is written once the
            # "Output Shape:" line is seen.
            inputshape = re.findall(r'\s*Input\sShape:\s*(.*)\s*:\s*(.*)\s*$', line)[0]
        elif is_outputshape_check(line):
            reset()
            outputshape = re.findall(r'\s*Output\sShape:\s*(.*)\s*:\s*(.*)\s*$', line)[0]
            _doc2md.md += ['']
            _doc2md.md += ['#' * (shiftlevel+2) + ' Expected Shape']
            _doc2md.md += [' | Shape | Description ']
            _doc2md.md += ['------ | ----- | ------------']
            # NOTE(review): `inputshape` is only assigned by the
            # "Input Shape:" branch above, so an "Output Shape:" line
            # without a preceding "Input Shape:" line fails here.
            _doc2md.md += [' input | ' + inputshape[0] + ' | ' + inputshape[1]]
            _doc2md.md += ['output | ' + outputshape[0] + ' | ' + outputshape[1]]
        elif is_image_check(line):
            reset()
            _doc2md.md += ['']
            filename = re.findall(r'\s*Image:\s*(.*?)\s*$', line)
            _doc2md.md += ['<img src="image/' + filename[0] + '" >']
        elif _doc2md.is_code == False and trimmed.startswith('>>> '):
            # First prompt line of a doctest: start collecting, including
            # this line.
            reset()
            _doc2md.is_code = True
            code = [line]
        elif _doc2md.is_code_block == False and trimmed.startswith('```'):
            # Opening fence: start collecting; the fence line is not kept.
            reset()
            _doc2md.is_code_block = True
            code_block = []
        elif _doc2md.is_code_block == True and trimmed.startswith('```'):
            # end of code block
            reset()
        elif shiftlevel != 0 and is_heading(line):
            # Headings are only rewritten when their level has to be shifted.
            reset()
            level, title = get_heading(line)
            _doc2md.md += [make_heading(level + shiftlevel, title)]
        # Inside an open section: non-empty lines are collected, an empty
        # line closes the section (the empty line itself is not emitted).
        elif _doc2md.is_args:
            if line:
                args.append(line)
            else:
                reset()
        elif _doc2md.is_returns:
            if line:
                returns.append(line)
            else:
                reset()
        elif _doc2md.is_code:
            if line:
                code.append(line)
            else:
                reset()
        elif _doc2md.is_code_block:
            if line:
                code_block.append(line)
            else:
                reset()
        else:
            # Ordinary text is copied through unchanged.
            reset()
            _doc2md.md += [line]
    # Flush a section that is still open at the end of the input.
    reset()
    # Code blocks first, then everything else.
    _doc2md.code += _doc2md.md
    return _doc2md.code
def doc2md(docstr, title, min_level=1, more_info=False, toc=True):
    """
    Convert a docstring to markdown.

    The document starts with a heading made from title, followed by the
    first line of the (trimmed) docstring, an optional table of contents
    and the converted remainder of the docstring.  Headings are shifted
    down where necessary so that the title heading is at least at
    min_level.

    Returns the markdown text, or -- if more_info is set -- a tuple of the
    list of markdown lines and the list of (level, title) sections.
    """
    lines = doctrim(docstr).split('\n')
    sections = find_sections(lines)
    level = (min(depth for depth, _ in sections) - 1) if sections else 1

    if level < min_level:
        shiftlevel = min_level - level
        level = min_level
        sections = [(depth + shiftlevel, name) for depth, name in sections]
    else:
        shiftlevel = 0

    md = [make_heading(level, title), "", lines.pop(0), ""]
    if toc:
        md.extend(make_toc(sections))
    md.extend(_doc2md(lines, shiftlevel))

    if more_info:
        return (md, sections)
    return "\n".join(md)
def mod2md(module, title, title_api_section, toc=True):
    """
    Generate a markdown document for a module, including its members.

    The module docstring is converted first.  If title_api_section is
    truthy, every entry of the module namespace whose name does not start
    with an underscore and which has a docstring is converted as well and
    appended one heading level below the module title.  With toc set, a
    table of contents is emitted for the module part and another one for
    the member part.

    Returns the markdown text.
    """
    lines = doctrim(module.__doc__ or " ").split('\n')
    sections = find_sections(lines)
    level = (min(depth for depth, _ in sections) - 1) if sections else 1

    api_md = []
    api_sec = []
    if title_api_section:
        for name, entry in module.__dict__.items():
            if name[0] == '_' or not entry.__doc__:
                continue
            entry_md, entry_sec = doc2md(entry.__doc__, name,
                                         min_level=level + 1,
                                         more_info=True, toc=False)
            api_sec.extend(entry_sec)
            api_md.extend(entry_md)
        sections.extend(api_sec)

    # Headline, followed by the first line of the module docstring.
    md = [make_heading(level, title), "", lines.pop(0), ""]

    # Module part.
    if toc:
        md.extend(make_toc(sections))
    md.extend(_doc2md(lines))

    # Member part.
    if toc:
        md.append('')
        md.extend(make_toc(api_sec))
    md.extend(api_md)
    return "\n".join(md)
def main(args=None):
    """
    Command line entry point.

    Imports the requested module and prints the markdown generated from
    the module docstring, from the docstring of a single entry of the
    module, or (with --all) from the module together with its members.
    args is the argument list handed to argparse; None lets argparse use
    the process arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Convert docstrings to markdown.')
    parser.add_argument(
        'module', help='The module containing the docstring.')
    # A single entry and --all exclude each other.
    selection = parser.add_mutually_exclusive_group()
    selection.add_argument(
        'entry', nargs='?',
        help='Convert only docstring of this entry in module.')
    selection.add_argument(
        '-a', '--all', dest='all', action='store_true',
        help='Create an API section with the contents of module.__all__.')
    parser.add_argument(
        '-t', '--title', dest='title',
        help='Document title (default is module name)')
    parser.add_argument(
        '--no-toc', dest='toc', action='store_false', default=True,
        help='Do not automatically generate the TOC')
    args = parser.parse_args(args)

    import importlib
    import inspect
    import os

    # Put the directory of this script and then the working directory at
    # the front of the module search path (unless they are listed already),
    # so that the working directory ends up first.
    this_file = inspect.getfile(inspect.currentframe())
    script_dir = os.path.realpath(os.path.abspath(os.path.dirname(this_file)))
    for location in (script_dir, os.getcwd()):
        if location not in sys.path:
            sys.path.insert(0, location)

    # Accept a file name in place of a module name.
    mod_name = args.module
    if mod_name.endswith('.py'):
        mod_name = mod_name[:-len('.py')]
    title = args.title or mod_name.replace('_', '-')
    module = importlib.import_module(mod_name)

    if args.all:
        print(mod2md(module, title, 'API', toc=args.toc))
        return
    if args.entry:
        docstr = module.__dict__[args.entry].__doc__ or ''
    else:
        docstr = module.__doc__ or ''
    print(doc2md(docstr, title, toc=args.toc))
if __name__ == "__main__":
main()

View File

@ -1,6 +0,0 @@
# Regenerate ../nn.md from the torch.nn docstrings.  doc2md.py is looked up
# relative to this script, so the script can be started from any directory.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Quoted so that a path containing spaces survives; stop if the directory
# cannot be entered instead of writing ../nn.md relative to the wrong place.
pushd "$SCRIPT_DIR" || exit 1
python doc2md.py torch.nn --no-toc --all >../nn.md
popd

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.2 KiB

36
docs/make.bat Normal file
View File

@ -0,0 +1,36 @@
@ECHO OFF
REM Run from the directory that contains this script; undone by popd at :end.
pushd %~dp0
REM Command file for Sphinx documentation
REM SPHINXBUILD may be preset in the environment; otherwise fall back to
REM the sphinx-build found on PATH.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
set SPHINXPROJ=PyTorch
REM Without a target argument just show the Sphinx help.
if "%1" == "" goto help
REM Probe run with all output discarded, only to find out whether the
REM command can be started at all.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
REM Hand the requested target (%1) to Sphinx "make mode".
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd

View File

@ -1,526 +0,0 @@
# torch.nn
## LogSoftmax
Applies the Log(Softmax(x)) function to an n-dimensional input Tensor.
The LogSoftmax formulation can be simplified as
f_i(x) = log(1 / a * exp(x_i)) where a = sum_j exp(x_j) .
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * ] | 2D Tensor of any size
output | Same | Output has the same shape as input
<img src="image/logsoftmax.png" >
```python
m = nn.LogSoftmax()
input = autograd.Variable(torch.randn(2, 3))
print(input)
print(m.forward(input))
```
## ReLU
Applies the rectified linear unit function element-wise ReLU(x)= max(0,x)
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
inplace | | can optionally do the operation in-place
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/relu.png" >
```python
m = nn.ReLU()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## LogSigmoid
Applies element-wise LogSigmoid(x) = log( 1 / (1 + exp(-x_i)))
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/logsigmoid.png" >
```python
m = nn.LogSigmoid()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## PReLU
Applies element-wise the function PReLU(x) = max(0,x) + a * min(0,x)
Here "a" is a learnable parameter.
When called without arguments, nn.PReLU() uses a single parameter "a"
across all input channels. If called with nn.PReLU(nChannels), a separate
"a" is used for each input channel.
Note that weight decay should not be used when learning "a" for good
performance.
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
num_parameters | 1 | number of "a" to learn.
init | 0.25 | the initial value of "a".
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/prelu.png" >
```python
m = nn.PReLU()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Softmax2d
Applies SoftMax over features to each spatial location
When given an image of Channels x Height x Width, it will
apply Softmax to each location [Channels, h_i, w_j]
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * , * , * ] | 4D Tensor of any size
output | Same | Output has the same shape as input
```python
m = nn.Softmax2d()
# you softmax over the 2nd dimension
input = autograd.Variable(torch.randn(2, 3, 12, 13))
print(input)
print(m.forward(input))
```
## ReLU6
Applies the element-wise function ReLU6(x) = min( max(0,x), 6)
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
inplace | | can optionally do the operation in-place
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/relu6.png" >
```python
m = nn.ReLU6()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Tanh
Applies element-wise, Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/tanh.png" >
```python
m = nn.Tanh()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Softplus
Applies element-wise SoftPlus(x) = 1/beta * log(1 + exp(beta * x_i))
SoftPlus is a smooth approximation to the ReLU function and can be used
to constrain the output of a machine to always be positive.
For numerical stability the implementation reverts to the linear function
for inputs above a certain value.
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
beta | 1 | the beta value for the Softplus formulation.
threshold | 20 | values above this revert to a linear function.
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/softplus.png" >
```python
m = nn.Softplus()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Threshold
Thresholds each element of the input Tensor
Threshold is defined as:
y = x if x >= threshold
value if x < threshold
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
threshold | | The value to threshold at
value | | The value to replace with
inplace | | can optionally do the operation in-place
### Returns
Tensor of same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
```python
m = nn.Threshold(0.1, 20)
input = Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Softmin
Applies the Softmin function to an n-dimensional input Tensor
rescaling them so that the elements of the n-dimensional output Tensor
lie in the range (0,1) and sum to 1
Softmin(x) = exp(-x_i - shift) / sum_j exp(-x_j - shift)
where shift = max_i (-x_i)
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * ] | 2D Tensor of any size
output | Same | Output has the same shape as input
<img src="image/softmin.png" >
```python
m = nn.Softmin()
input = autograd.Variable(torch.randn(2, 3))
print(input)
print(m.forward(input))
```
## Softshrink
Applies the soft shrinkage function elementwise
SoftShrinkage operator is defined as:
f(x) = x-lambda, if x > lambda
f(x) = x+lambda, if x < -lambda
f(x) = 0, otherwise
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
lambd | 0.5 | the lambda value for the Softshrink formulation.
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/sshrink.png" >
```python
m = nn.Softshrink()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## ELU
Applies element-wise, ELU(x) = max(0,x) + min(0, alpha * (exp(x) - 1))
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
alpha | 1.0 | the alpha value for the ELU formulation.
inplace | | can optionally do the operation in-place
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/elu.png" >
```python
m = nn.ELU()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Hardshrink
Applies the hard shrinkage function element-wise
Hardshrink is defined as f(x) = x, if x > lambda
f(x) = x, if x < -lambda
f(x) = 0, otherwise
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
lambd | 0.5 | the lambda value for the Hardshrink formulation.
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/hshrink.png" >
```python
m = nn.Hardshrink()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Hardtanh
Applies the HardTanh function element-wise
HardTanh is defined as:
f(x) = +1, if x > 1
f(x) = -1, if x < -1
f(x) = x, otherwise
The range of the linear region [-1, 1] can be adjusted
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
min_value | | minimum value of the linear region range
max_value | | maximum value of the linear region range
inplace | | can optionally do the operation in-place
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/htanh.png" >
```python
m = nn.HardTanh(-2, 2)
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Softsign
Applies element-wise, the function Softsign(x) = x / (1 + |x|)
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/softsign.png" >
```python
m = nn.Softsign()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## LeakyReLU
Applies element-wise, f(x) = max(0, x) + negative_slope * min(0, x)
### Constructor Arguments
Parameter | Default | Description
--------- | ------- | -----------
negative_slope | 1e-2 | Controls the angle of the negative slope.
inplace | | can optionally do the operation in-place
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
```python
m = nn.LeakyReLU(0.1)
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Sigmoid
Applies the element-wise function sigmoid(x) = 1 / ( 1 + exp(-x))
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
<img src="image/sigmoid.png" >
```python
m = nn.Sigmoid()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Tanhshrink
Applies element-wise, Tanhshrink(x) = x - Tanh(x)
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | Any | Tensor of any size and dimension
output | Same | Output has the same shape as input
```python
m = nn.Tanhshrink()
input = autograd.Variable(torch.randn(2))
print(input)
print(m.forward(input))
```
## Softmax
Applies the Softmax function to an n-dimensional input Tensor
rescaling them so that the elements of the n-dimensional output Tensor
lie in the range (0,1) and sum to 1
Softmax is defined as f_i(x) = exp(x_i - shift) / sum_j exp(x_j - shift)
where shift = max_i x_i
### Returns
a Tensor of the same dimension and shape as the input
### Expected Shape
| Shape | Description
------ | ----- | ------------
input | [ * , * ] | 2D Tensor of any size
output | Same | Output has the same shape as input
<img src="image/softmax.png" >
Notes:
Note that this module doesn't work directly with NLLLoss,
which expects the Log to be computed between the Softmax and itself.
Use Logsoftmax instead (it's faster).
```python
m = nn.Softmax()
input = autograd.Variable(torch.randn(2, 3))
print(input)
print(m.forward(input))
```

2
docs/requirements.txt Normal file
View File

@ -0,0 +1,2 @@
sphinx
-e git+https://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme

View File

@ -0,0 +1,118 @@
body {
font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
/* Default header fonts are ugly */
h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
/* Use white for docs background */
.wy-side-nav-search {
background-color: #fff;
}
.wy-nav-content-wrap, .wy-menu li.current > a {
background-color: #fff;
}
@media screen and (min-width: 1400px) {
.wy-nav-content-wrap {
background-color: rgba(0, 0, 0, 0.0470588);
}
.wy-nav-content {
background-color: #fff;
}
}
/* Fixes for mobile */
.wy-nav-top {
background-color: #fff;
background-image: url('../img/pytorch-logo-dark.svg');
background-repeat: no-repeat;
background-position: center;
padding: 0;
margin: 0.4045em 0.809em;
color: #333;
}
.wy-nav-top > a {
display: none;
}
@media screen and (max-width: 768px) {
.wy-side-nav-search>a img.logo {
height: 60px;
}
}
/* This is needed to ensure that logo above search scales properly */
.wy-side-nav-search a {
display: block;
}
/* This ensures that multiple constructors will remain in separate lines. */
.rst-content dl:not(.docutils) dt {
display: table;
}
/* Use our red for literals (it's very similar to the original color) */
.rst-content tt.literal, .rst-content code.literal {
    color: #F05732;
}
/* Cross-references and literals inside links use dark grey instead */
.rst-content tt.xref, .rst-content code.xref,
a .rst-content tt, a .rst-content code {
    color: #404040;
}
/* Change link colors (except for the menu) */
a {
color: #F05732;
}
a:hover {
color: #F05732;
}
a:visited {
color: #D44D2C;
}
.wy-menu a {
color: #b3b3b3;
}
.wy-menu a:hover {
color: #b3b3b3;
}
/* Default footer text is quite big */
footer {
font-size: 80%;
}
footer .rst-footer-buttons {
font-size: 125%; /* revert footer settings - 1/80% = 125% */
}
footer p {
font-size: 100%;
}
/* For hidden headers that appear in TOC tree */
/* see http://stackoverflow.com/a/32363545/3343043 */
.rst-content .hidden-section {
display: none;
}
nav .hidden-section {
display: inherit;
}
.wy-side-nav-search>div.version {
color: #000;
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 258 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

View File

@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Generator: Adobe Illustrator 21.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
viewBox="0 0 199.7 40.2" style="enable-background:new 0 0 199.7 40.2;" xml:space="preserve">
<style type="text/css">
.st0{fill:#F05732;}
.st1{fill:#9E529F;}
.st2{fill:#333333;}
</style>
<path class="st0" d="M102.7,12.2c-1.3-1-1.8,3.9-4.4,3.9c-3,0-4-13-6.3-13c-0.7,0-0.8-0.4-7.9,21.3c-2.9,9,4.4,15.8,11.8,15.8
c4.6,0,12.3-3,12.3-12.6C108.2,20.5,104.7,13.7,102.7,12.2z M95.8,35.3c-3.7,0-6.7-3.1-6.7-7c0-3.9,3-7,6.7-7s6.7,3.1,6.7,7
C102.5,32.1,99.5,35.3,95.8,35.3z"/>
<path class="st1" d="M99.8,0c-0.5,0-1.8,2.5-1.8,3.6c0,1.5,1,2,1.8,2c0.8,0,1.8-0.5,1.8-2C101.5,2.5,100.2,0,99.8,0z"/>
<path class="st2" d="M0,39.5V14.9h11.5c5.3,0,8.3,3.6,8.3,7.9c0,4.3-3,7.9-8.3,7.9H5.2v8.8H0z M14.4,22.8c0-2.1-1.6-3.3-3.7-3.3H5.2
v6.6h5.5C12.8,26.1,14.4,24.8,14.4,22.8z"/>
<path class="st2" d="M35.2,39.5V29.4l-9.4-14.5h6l6.1,9.8l6.1-9.8h5.9l-9.4,14.5v10.1H35.2z"/>
<path class="st2" d="M63.3,39.5v-20h-7.2v-4.6h19.6v4.6h-7.2v20H63.3z"/>
<path class="st2" d="M131.4,39.5l-4.8-8.7h-3.8v8.7h-5.2V14.9H129c5.1,0,8.3,3.4,8.3,7.9c0,4.3-2.8,6.7-5.4,7.3l5.6,9.4H131.4z
M131.9,22.8c0-2-1.6-3.3-3.7-3.3h-5.5v6.6h5.5C130.3,26.1,131.9,24.9,131.9,22.8z"/>
<path class="st2" d="M145.6,27.2c0-7.6,5.7-12.7,13.1-12.7c5.4,0,8.5,2.9,10.3,6l-4.5,2.2c-1-2-3.2-3.6-5.8-3.6
c-4.5,0-7.7,3.4-7.7,8.1c0,4.6,3.2,8.1,7.7,8.1c2.5,0,4.7-1.6,5.8-3.6l4.5,2.2c-1.7,3.1-4.9,6-10.3,6
C151.3,39.9,145.6,34.7,145.6,27.2z"/>
<path class="st2" d="M194.5,39.5V29.1h-11.6v10.4h-5.2V14.9h5.2v9.7h11.6v-9.7h5.3v24.6H194.5z"/>
</svg>

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1010 B

View File

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
height="40.200001"
width="40.200001"
xml:space="preserve"
viewBox="0 0 40.200002 40.2"
y="0px"
x="0px"
id="Layer_1"
version="1.1"><metadata
id="metadata4717"><rdf:RDF><cc:Work
rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
id="defs4715" /><style
id="style4694"
type="text/css">
.st0{fill:#F05732;}
.st1{fill:#9E529F;}
.st2{fill:#333333;}
</style><path
style="fill:#f05732"
id="path4696"
d="m 26.975479,12.199999 c -1.3,-1 -1.8,3.9 -4.4,3.9 -3,0 -4,-12.9999998 -6.3,-12.9999998 -0.7,0 -0.8,-0.4 -7.9000003,21.2999998 -2.9000001,9 4.4000003,15.8 11.8000003,15.8 4.6,0 12.3,-3 12.3,-12.6 0,-7.1 -3.5,-13.9 -5.5,-15.4 z m -6.9,23.1 c -3.7,0 -6.7,-3.1 -6.7,-7 0,-3.9 3,-7 6.7,-7 3.7,0 6.7,3.1 6.7,7 0,3.8 -3,7 -6.7,7 z"
class="st0" /><path
style="fill:#9e529f"
id="path4698"
d="m 24.075479,-7.6293945e-7 c -0.5,0 -1.8,2.49999996293945 -1.8,3.59999996293945 0,1.5 1,2 1.8,2 0.8,0 1.8,-0.5 1.8,-2 -0.1,-1.1 -1.4,-3.59999996293945 -1.8,-3.59999996293945 z"
class="st1" /></svg>

After

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

55
docs/source/autograd.rst Normal file
View File

@ -0,0 +1,55 @@
.. role:: hidden
:class: hidden-section
Automatic differentiation package - torch.autograd
==================================================
.. automodule:: torch.autograd
.. currentmodule:: torch.autograd
.. autofunction:: backward
.. autofunction:: grad
Variable
--------
API compatibility
^^^^^^^^^^^^^^^^^
The Variable API is nearly the same as the regular Tensor API (with the exception
of a couple of in-place methods that would overwrite inputs required for
gradient computation). In most cases Tensors can be safely replaced with
Variables and the code will continue to work just fine. Because of this,
we're not documenting all the operations on Variables, and you should
refer to the :class:`torch.Tensor` docs for this purpose.
In-place operations on Variables
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Supporting in-place operations in autograd is a hard matter, and we discourage
their use in most cases. Autograd's aggressive buffer freeing and reuse makes
it very efficient and there are very few occasions when in-place operations
actually lower memory usage by any significant amount. Unless you're operating
under heavy memory pressure, you might never need to use them.
In-place correctness checks
^^^^^^^^^^^^^^^^^^^^^^^^^^^
All :class:`Variable` s keep track of in-place operations applied to them, and
if the implementation detects that a variable was saved for backward in one of
the functions, but it was modified in-place afterwards, an error will be raised
once backward pass is started. This ensures that if you're using in-place
functions and not seeing any errors, you can be sure that the computed
gradients are correct.
.. autoclass:: Variable
:members:
:hidden:`Function`
------------------
.. autoclass:: Function
:members:

249
docs/source/conf.py Normal file
View File

@ -0,0 +1,249 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# PyTorch documentation build configuration file, created by
# sphinx-quickstart on Fri Dec 23 13:31:47 2016.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import torch
try:
import torchvision
except ImportError:
import warnings
warnings.warn('unable to load "torchvision" package')
import sphinx_rtd_theme
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.doctest',
'sphinx.ext.intersphinx',
'sphinx.ext.todo',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
'sphinx.ext.viewcode',
]
# Have napoleon emit ``:ivar:`` fields for documented attributes. How typed
# fields are rendered is adjusted by the TypedField patch at the end of this
# file.
napoleon_use_ivar = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'PyTorch'
copyright = '2017, Torch Contributors'
author = 'Torch Contributors'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# TODO: change to [:2] at v1.0
# For now the string embeds the full version of the imported ``torch`` build.
# NOTE(review): the result keeps a space before the closing parenthesis
# ("master (<version> )") -- presumably unintentional; confirm before changing.
version = 'master (' + torch.__version__ + ' )'
# The full version, including alpha/beta/rc tags.
# TODO: verify this works as expected
release = 'master'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path.
exclude_patterns = []
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# The Read the Docs theme is not a Sphinx builtin, so its location is taken
# from the imported ``sphinx_rtd_theme`` package.
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {
'collapse_navigation': False,
'display_version': True,
'logo_only': True,
}
# Logo image for the theme; the options above request logo-only display.
html_logo = '_static/img/pytorch-logo-dark.svg'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# NOTE(review): the commented-out setting below is presumably superseded by
# the ``css_files`` entry in ``html_context`` -- confirm before removing.
# html_style_path = 'css/pytorch_theme.css'
# Stylesheets passed to the page templates through the HTML context: the Lato
# web font plus the project's own theme overrides shipped under ``_static``.
html_context = {
'css_files': [
'https://fonts.googleapis.com/css?family=Lato',
'_static/css/pytorch_theme.css'
],
}
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'PyTorchdoc'
# -- Options for LaTeX output ---------------------------------------------
# Every entry below is commented out, so this dict is intentionally empty.
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'pytorch.tex', 'PyTorch Documentation',
'Torch Contributors', 'manual'),
]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'PyTorch', 'PyTorch Documentation',
[author], 1)
]
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
# NOTE(review): the description below still reads like the sphinx-quickstart
# placeholder -- consider replacing it with a real one-line summary.
texinfo_documents = [
(master_doc, 'PyTorch', 'PyTorch Documentation',
author, 'PyTorch', 'One line description of project.',
'Miscellaneous'),
]
# Example configuration for intersphinx: refer to the Python standard library.
# The 'python' key is the inventory name that ``patched_make_field`` (below)
# prefixes onto builtin type names such as ``int``.
intersphinx_mapping = {
'python': ('https://docs.python.org/', None),
'numpy': ('http://docs.scipy.org/doc/numpy/', None),
}
# -- A patch that prevents Sphinx from cross-referencing ivar tags -------
# See http://stackoverflow.com/a/41184353/3343043
from docutils import nodes
from sphinx.util.docfields import TypedField
from sphinx import addnodes
def patched_make_field(self, types, domain, items, **kw):
    """Build the field node for a typed field without cross-referencing names.

    Replacement for ``TypedField.make_field``. It differs from the stock
    implementation in two ways:

    * the field argument (e.g. an ``ivar`` name) is rendered as plain strong
      literal text instead of being turned into a cross-reference;
    * the builtin type names ``int``, ``long``, ``float`` and ``type`` that
      appear in a textual type annotation are prefixed with ``python:`` so
      the reference resolves through the ``python`` intersphinx inventory.

    Args:
        types: mapping from field argument name to the list of nodes that
            describe its type. Entries are popped as they are consumed.
        domain: name of the domain, passed through to ``make_xrefs``.
        items: sequence of ``(fieldarg, content)`` pairs to render.
        **kw: extra keyword arguments (e.g. ``env`` on newer Sphinx), passed
            through to ``make_xrefs`` so that old and new Sphinx releases
            both keep working.

    Returns:
        A ``nodes.field`` holding the field label and the rendered body.
    """
    import re  # local import keeps the patch self-contained

    # Only rewrite the builtin names when they stand alone. The previous
    # chained ``str.replace`` calls also rewrote substrings, corrupting names
    # such as "dtype" ("dpython:type") or "pointer" ("popython:inter"). The
    # lookbehind also skips dotted names and already-prefixed ones.
    builtin_type = re.compile(r'(?<![\w.:])(int|long|float|type)\b')

    def handle_item(fieldarg, content):
        par = nodes.paragraph()
        par += addnodes.literal_strong('', fieldarg)  # Patch: this line added
        # Stock Sphinx cross-references the field name here instead:
        # par.extend(self.make_xrefs(self.rolename, domain, fieldarg,
        #                            addnodes.literal_strong))
        if fieldarg in types:
            par += nodes.Text(' (')
            # NOTE: using .pop() here to prevent a single type node to be
            # inserted twice into the doctree, which leads to
            # inconsistencies later when references are resolved
            fieldtype = types.pop(fieldarg)
            if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text):
                typename = u''.join(n.astext() for n in fieldtype)
                typename = builtin_type.sub(r'python:\1', typename)
                par.extend(self.make_xrefs(self.typerolename, domain, typename,
                                           addnodes.literal_emphasis, **kw))
            else:
                par += fieldtype
            par += nodes.Text(')')
        par += nodes.Text(' -- ')
        par += content
        return par

    fieldname = nodes.field_name('', self.label)
    if len(items) == 1 and self.can_collapse:
        # A single collapsible item is rendered inline, not as a list.
        fieldarg, content = items[0]
        bodynode = handle_item(fieldarg, content)
    else:
        bodynode = self.list_type()
        for fieldarg, content in items:
            bodynode += nodes.list_item('', handle_item(fieldarg, content))
    fieldbody = nodes.field_body('', bodynode)
    return nodes.field('', fieldname, fieldbody)
# Install the patched implementation on the class, so every TypedField
# rendered during the build goes through ``patched_make_field``.
TypedField.make_field = patched_make_field

34
docs/source/cuda.rst Normal file
View File

@ -0,0 +1,34 @@
torch.cuda
===================================
.. currentmodule:: torch.cuda
.. automodule:: torch.cuda
:members:
Communication collectives
-------------------------
.. autofunction:: torch.cuda.comm.broadcast
.. autofunction:: torch.cuda.comm.reduce_add
.. autofunction:: torch.cuda.comm.scatter
.. autofunction:: torch.cuda.comm.gather
Streams and events
------------------
.. autoclass:: Stream
:members:
.. autoclass:: Event
:members:
NVIDIA Tools Extension (NVTX)
-----------------------------
.. autofunction:: torch.cuda.nvtx.mark
.. autofunction:: torch.cuda.nvtx.range_push
.. autofunction:: torch.cuda.nvtx.range_pop

13
docs/source/data.rst Normal file
View File

@ -0,0 +1,13 @@
torch.utils.data
===================================
.. automodule:: torch.utils.data
.. autoclass:: Dataset
.. autoclass:: TensorDataset
.. autoclass:: DataLoader
.. autoclass:: torch.utils.data.sampler.Sampler
.. autoclass:: torch.utils.data.sampler.SequentialSampler
.. autoclass:: torch.utils.data.sampler.RandomSampler
.. autoclass:: torch.utils.data.sampler.SubsetRandomSampler
.. autoclass:: torch.utils.data.sampler.WeightedRandomSampler
.. autoclass:: torch.utils.data.distributed.DistributedSampler

165
docs/source/distributed.rst Normal file
View File

@ -0,0 +1,165 @@
.. role:: hidden
:class: hidden-section
Distributed communication package - torch.distributed
=====================================================
.. automodule:: torch.distributed
.. currentmodule:: torch.distributed
Currently torch.distributed supports three backends, each with
different capabilities. The table below shows which functions are available
for use with CPU / CUDA tensors.
MPI supports CUDA only if the implementation used to build PyTorch supports it.
+------------+-----------+-----------+-----------+
| Backend | ``tcp`` | ``gloo`` | ``mpi`` |
+------------+-----+-----+-----+-----+-----+-----+
| Device | CPU | GPU | CPU | GPU | CPU | GPU |
+============+=====+=====+=====+=====+=====+=====+
| send | ✓ | ✘ | ✘ | ✘ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
| recv | ✓ | ✘ | ✘ | ✘ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
| broadcast | ✓ | ✘ | ✓ | ✓ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
| all_reduce | ✓ | ✘ | ✓ | ✓ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
| reduce | ✓ | ✘ | ✘ | ✘ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
| all_gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
| gather | ✓ | ✘ | ✘ | ✘ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
| scatter | ✓ | ✘ | ✘ | ✘ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
| barrier | ✓ | ✘ | ✓ | ✓ | ✓ | ? |
+------------+-----+-----+-----+-----+-----+-----+
Initialization
--------------
The package needs to be initialized using the :func:`torch.distributed.init_process_group`
function before calling any other methods.
.. autofunction:: init_process_group
.. autofunction:: get_rank
.. autofunction:: get_world_size
--------------------------------------------------------------------------------
Currently three initialization methods are supported:
TCP initialization
^^^^^^^^^^^^^^^^^^
Initialization will utilize a network address reachable from all processes.
If the address belongs to one of the machines, initialization requires that all processes
have manually specified ranks.
Alternatively, the address has to be a valid IP multicast address, in which case,
ranks can be assigned automatically. Multicast initialization also supports
a ``group_name`` argument, which allows you to use the same address for multiple jobs,
as long as they use different group names.
::
import torch.distributed as dist
# Use address of one of the machines
dist.init_process_group(init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4)
# or a multicast address - rank will be assigned automatically if unspecified
dist.init_process_group(init_method='tcp://[ff15:1e18:5d4c:4cf0:d02d:b659:53ba:b0a7]:23456',
world_size=4)
Shared file-system initialization
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Another initialization method makes use of a file system shared and visible from
all machines in a group. The URL should start with ``file://`` and contain a path
to a non-existent file (in an existing directory) on a shared file system.
This initialization method also supports a ``group_name`` argument, which allows you to
use the same shared file path for multiple jobs, as long as they use different
group names.
.. warning::
This method assumes that the file system supports locking using ``fcntl`` - most
local systems and NFS support it.
::
import torch.distributed as dist
# Rank will be assigned automatically if unspecified
dist.init_process_group(init_method='file:///mnt/nfs/sharedfile', world_size=4,
group_name=args.group)
Environment variable initialization
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This method will read the configuration from environment variables, allowing
one to fully customize how the information is obtained. The variables to be set
are:
* ``MASTER_PORT`` - required; has to be a free port on the machine with rank 0
* ``MASTER_ADDR`` - required (except for rank 0); address of rank 0 node
* ``WORLD_SIZE`` - required; can be set either here, or in a call to init function
* ``RANK`` - required; can be set either here, or in a call to init function
The machine with rank 0 will be used to set up all connections.
This is the default method, meaning that ``init_method`` does not have to be specified (or
can be ``env://``).
Groups
------
By default collectives operate on the default group (also called the world) and
require all processes to enter the distributed function call. However, some workloads can benefit
from more fine-grained communication. This is where distributed groups come
into play. :func:`~torch.distributed.new_group` function can be
used to create new groups, with arbitrary subsets of all processes. It returns
an opaque group handle that can be given as a ``group`` argument to all collectives
(collectives are distributed functions to exchange information in certain well-known programming patterns).
.. autofunction:: new_group
Point-to-point communication
----------------------------
.. autofunction:: send
.. autofunction:: recv
:func:`~torch.distributed.isend` and :func:`~torch.distributed.irecv`
return distributed request objects when used. In general, the type of this object is unspecified
as they should never be created manually, but they are guaranteed to support two methods:
* ``is_completed()`` - returns True if the operation has finished
* ``wait()`` - will block the process until the operation is finished.
  ``is_completed()`` is guaranteed to return True once ``wait()`` returns.
.. autofunction:: isend
.. autofunction:: irecv
Collective functions
--------------------
.. autofunction:: broadcast
.. autofunction:: all_reduce
.. autofunction:: reduce
.. autofunction:: all_gather
.. autofunction:: gather
.. autofunction:: scatter
.. autofunction:: barrier

6
docs/source/ffi.rst Normal file
View File

@ -0,0 +1,6 @@
torch.utils.ffi
===============
.. currentmodule:: torch.utils.ffi
.. autofunction:: create_extension

56
docs/source/index.rst Normal file
View File

@ -0,0 +1,56 @@
.. PyTorch documentation master file, created by
sphinx-quickstart on Fri Dec 23 13:31:47 2016.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
:github_url: https://github.com/pytorch/pytorch
PyTorch documentation
===================================
PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
.. toctree::
:glob:
:maxdepth: 1
:caption: Notes
notes/*
.. toctree::
:maxdepth: 1
:caption: Package Reference
torch
tensors
sparse
storage
nn
optim
torch.autograd <autograd>
torch.multiprocessing <multiprocessing>
torch.distributed <distributed>
torch.legacy <legacy>
cuda
ffi
data
model_zoo
.. toctree::
:glob:
:maxdepth: 1
:caption: torchvision Reference
torchvision/torchvision
torchvision/datasets
torchvision/models
torchvision/transforms
torchvision/utils
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`

4
docs/source/legacy.rst Normal file
View File

@ -0,0 +1,4 @@
Legacy package - torch.legacy
===================================
.. automodule:: torch.legacy

View File

@ -0,0 +1,5 @@
torch.utils.model_zoo
===================================
.. automodule:: torch.utils.model_zoo
.. autofunction:: load_url

View File

@ -0,0 +1,88 @@
Multiprocessing package - torch.multiprocessing
===============================================
.. automodule:: torch.multiprocessing
.. currentmodule:: torch.multiprocessing
.. warning::
If the main process exits abruptly (e.g. because of an incoming signal),
Python's ``multiprocessing`` sometimes fails to clean up its children.
It's a known caveat, so if you're seeing any resource leaks after
interrupting the interpreter, it probably means that this has just happened
to you.
Strategy management
-------------------
.. autofunction:: get_all_sharing_strategies
.. autofunction:: get_sharing_strategy
.. autofunction:: set_sharing_strategy
Sharing CUDA tensors
--------------------
Sharing CUDA tensors between processes is supported only in Python 3, using
the ``spawn`` or ``forkserver`` start methods. :mod:`python:multiprocessing` in
Python 2 can only create subprocesses using ``fork``, which is not supported
by the CUDA runtime.
.. warning::
CUDA API requires that the allocation exported to other processes remains
valid as long as it's used by them. You should be careful and ensure that
CUDA tensors you shared don't go out of scope as long as it's necessary.
This shouldn't be a problem for sharing model parameters, but passing other
kinds of data should be done with care. Note that this restriction doesn't
apply to shared CPU memory.
Sharing strategies
------------------
This section provides a brief overview of how the different sharing strategies
work. Note that it applies only to CPU tensors - CUDA tensors will always use
the CUDA API, as that's the only way they can be shared.
File descriptor - ``file_descriptor``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. note::
This is the default strategy (except for macOS and OS X where it's not
supported).
This strategy will use file descriptors as shared memory handles. Whenever a
storage is moved to shared memory, a file descriptor obtained from ``shm_open``
is cached with the object, and when it's going to be sent to other processes,
the file descriptor will be transferred (e.g. via UNIX sockets) to it. The
receiver will also cache the file descriptor and ``mmap`` it, to obtain a shared
view onto the storage data.
Note that if a lot of tensors are shared, this strategy will keep a
large number of file descriptors open most of the time. If your system has low
limits for the number of open file descriptors, and you can't raise them, you
should use the ``file_system`` strategy.
File system - ``file_system``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This strategy will use file names given to ``shm_open`` to identify the shared
memory regions. This has a benefit of not requiring the implementation to cache
the file descriptors obtained from it, but at the same time is prone to shared
memory leaks. The file can't be deleted right after its creation, because other
processes need to access it to open their views. If the processes fatally
crash, or are killed, and don't call the storage destructors, the files will
remain in the system. This is very serious, because they keep using up the
memory until the system is restarted, or they're freed manually.
To counter the problem of shared memory file leaks, :mod:`torch.multiprocessing`
will spawn a daemon named ``torch_shm_manager`` that will isolate itself from
the current process group, and will keep track of all shared memory allocations.
Once all processes connected to it exit, it will wait a moment to ensure there
will be no new connections, and will iterate over all shared memory files
allocated by the group. If it finds that any of them still exist, they will be
deallocated. We've tested this method and it proved to be robust to various
failures. Still, if your system has high enough limits, and ``file_descriptor``
is a supported strategy, we do not recommend switching to this one.

1082
docs/source/nn.rst Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,151 @@
Autograd mechanics
==================
This note will present an overview of how autograd works and records the
operations. It's not strictly necessary to understand all this, but we recommend
getting familiar with it, as it will help you write more efficient, cleaner
programs, and can aid you in debugging.
.. _excluding-subgraphs:
Excluding subgraphs from backward
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Every Variable has two flags: :attr:`requires_grad` and :attr:`volatile`.
They both allow for fine grained exclusion of subgraphs from gradient
computation and can increase efficiency.
.. _excluding-requires_grad:
``requires_grad``
~~~~~~~~~~~~~~~~~
If even a single input to an operation requires gradient, its output
will also require gradient. Conversely, the output won't require gradient
only if none of the inputs require it. Backward computation is never
performed in subgraphs where no Variable requires gradients.
.. code::
>>> x = Variable(torch.randn(5, 5))
>>> y = Variable(torch.randn(5, 5))
>>> z = Variable(torch.randn(5, 5), requires_grad=True)
>>> a = x + y
>>> a.requires_grad
False
>>> b = a + z
>>> b.requires_grad
True
This is especially useful when you want to freeze part of your model, or you
know in advance that you're not going to use gradients w.r.t. some parameters.
For example if you want to finetune a pretrained CNN, it's enough to switch the
:attr:`requires_grad` flags in the frozen base, and no intermediate buffers will
be saved, until the computation gets to the last layer, where the affine
transform will use weights that require gradient, and the output of the network
will also require them.
.. code::
model = torchvision.models.resnet18(pretrained=True)
for param in model.parameters():
param.requires_grad = False
# Replace the last fully-connected layer
# Parameters of newly constructed modules have requires_grad=True by default
model.fc = nn.Linear(512, 100)
# Optimize only the classifier
optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)
``volatile``
~~~~~~~~~~~~
Volatile is recommended for purely inference mode, when you're sure you won't
even be calling ``.backward()``. It's more efficient than any other autograd
setting - it will use the absolute minimal amount of memory to evaluate the
model. ``volatile`` also implies that ``requires_grad is False``.
Volatile differs from :ref:`excluding-requires_grad` in how the flag propagates.
If there's even a single volatile input to an operation, its output is also
going to be volatile. Volatility spreads across the graph much easier than
non-requiring gradient - you only need a **single** volatile leaf to have a
volatile output, while you need **all** leaves to not require gradient to
have an output that doesn't require gradient. Using volatile flag you don't
need to change any settings of your model parameters to use it for
inference. It's enough to create a volatile input, and this will ensure that
no intermediate states are saved.
.. code::
>>> regular_input = Variable(torch.randn(1, 3, 227, 227))
>>> volatile_input = Variable(torch.randn(1, 3, 227, 227), volatile=True)
>>> model = torchvision.models.resnet18(pretrained=True)
>>> model(regular_input).requires_grad
True
>>> model(volatile_input).requires_grad
False
>>> model(volatile_input).volatile
True
>>> model(volatile_input).grad_fn is None
True
How autograd encodes the history
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Autograd is a reverse automatic differentiation system. Conceptually,
autograd keeps a graph recording all of the operations that created
the data as you execute operations, giving you a directed acyclic graph
whose leaves are the input variables and roots are the output variables.
By tracing this graph from roots to leaves, you can automatically
compute the gradients using the chain rule.
Internally, autograd represents this graph as a graph of
:class:`Function` objects (really expressions), which can be
:meth:`~torch.autograd.Function.apply` ed to compute the result of
evaluating the graph. When computing the forwards pass, autograd
simultaneously performs the requested computations and builds up a graph
representing the function that computes the gradient (the ``.grad_fn``
attribute of each :class:`Variable` is an entry point into this graph).
When the forwards pass is completed, we evaluate this graph in the
backwards pass to compute the gradients.
An important thing to note is that the graph is recreated from scratch at every
iteration, and this is exactly what allows for using arbitrary Python control
flow statements, that can change the overall shape and size of the graph at
every iteration. You don't have to encode all possible paths before you
launch the training - what you run is what you differentiate.
In-place operations on Variables
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Supporting in-place operations in autograd is a hard matter, and we discourage
their use in most cases. Autograd's aggressive buffer freeing and reuse makes
it very efficient and there are very few occasions when in-place operations
actually lower memory usage by any significant amount. Unless you're operating
under heavy memory pressure, you might never need to use them.
There are two main reasons that limit the applicability of in-place operations:
1. Overwriting values required to compute gradients. This is why variables don't
support ``log_``. Its gradient formula requires the original input, and while
it is possible to recreate it by computing the inverse operation, it is
numerically unstable, and requires additional work that often defeats the
purpose of using these functions.
2. Every in-place operation actually requires the implementation to rewrite the
computational graph. Out-of-place versions simply allocate new objects and
keep references to the old graph, while in-place operations require
changing the creator of all inputs to the :class:`Function` representing
this operation. This can be tricky, especially if there are many Variables
that reference the same storage (e.g. created by indexing or transposing),
and in-place functions will actually raise an error if the storage of
modified inputs is referenced by any other :class:`Variable`.
In-place correctness checks
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Every variable keeps a version counter, that is incremented every time it's
marked dirty in any operation. When a Function saves any tensors for backward,
a version counter of their containing Variable is saved as well. Once you access
``self.saved_tensors`` it is checked, and if it's greater than the saved value
an error is raised.

View File

@ -0,0 +1,113 @@
.. _broadcasting-semantics:
Broadcasting semantics
======================
Many PyTorch operations support :any:`NumPy Broadcasting Semantics <numpy.doc.broadcasting>`.
In short, if a PyTorch operation supports broadcast, then its Tensor arguments can be
automatically expanded to be of equal sizes (without making copies of the data).
General semantics
-----------------
Two tensors are "broadcastable" if the following rules hold:
- Each tensor has at least one dimension.
- When iterating over the dimension sizes, starting at the trailing dimension,
the dimension sizes must either be equal, one of them is 1, or one of them
does not exist.
For Example::
>>> x=torch.FloatTensor(5,7,3)
>>> y=torch.FloatTensor(5,7,3)
# same shapes are always broadcastable (i.e. the above rules always hold)
>>> x=torch.FloatTensor()
>>> y=torch.FloatTensor(2,2)
# x and y are not broadcastable, because x does not have at least 1 dimension
# can line up trailing dimensions
>>> x=torch.FloatTensor(5,3,4,1)
>>> y=torch.FloatTensor( 3,1,1)
# x and y are broadcastable.
# 1st trailing dimension: both have size 1
# 2nd trailing dimension: y has size 1
# 3rd trailing dimension: x size == y size
# 4th trailing dimension: y dimension doesn't exist
# but:
>>> x=torch.FloatTensor(5,2,4,1)
>>> y=torch.FloatTensor( 3,1,1)
# x and y are not broadcastable, because in the 3rd trailing dimension 2 != 3
If two tensors :attr:`x`, :attr:`y` are "broadcastable", the resulting tensor size
is calculated as follows:
- If the number of dimensions of :attr:`x` and :attr:`y` are not equal, prepend 1
to the dimensions of the tensor with fewer dimensions to make them equal length.
- Then, for each dimension size, the resulting dimension size is the max of the sizes of
:attr:`x` and :attr:`y` along that dimension.
For Example::
# can line up trailing dimensions to make reading easier
>>> x=torch.FloatTensor(5,1,4,1)
>>> y=torch.FloatTensor( 3,1,1)
>>> (x+y).size()
torch.Size([5, 3, 4, 1])
# but not necessary:
>>> x=torch.FloatTensor(1)
>>> y=torch.FloatTensor(3,1,7)
>>> (x+y).size()
torch.Size([3, 1, 7])
>>> x=torch.FloatTensor(5,2,4,1)
>>> y=torch.FloatTensor(3,1,1)
>>> (x+y).size()
RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1
In-place semantics
------------------
One complication is that in-place operations do not allow the in-place tensor to change shape
as a result of the broadcast.
For Example::
>>> x=torch.FloatTensor(5,3,4,1)
>>> y=torch.FloatTensor(3,1,1)
>>> (x.add_(y)).size()
torch.Size([5, 3, 4, 1])
# but:
>>> x=torch.FloatTensor(1,3,1)
>>> y=torch.FloatTensor(3,1,7)
>>> (x.add_(y)).size()
RuntimeError: The expanded size of the tensor (1) must match the existing size (7) at non-singleton dimension 2.
Backwards compatibility
-----------------------
Prior versions of PyTorch allowed certain pointwise functions to execute on tensors with different shapes,
as long as the number of elements in each tensor was equal. The pointwise operation would then be carried
out by viewing each tensor as 1-dimensional. PyTorch now supports broadcasting and the "1-dimensional"
pointwise behavior is considered deprecated and will generate a Python warning in cases where tensors are
not broadcastable, but have the same number of elements.
Note that the introduction of broadcasting can cause backwards incompatible changes in the case where
two tensors do not have the same shape, but are broadcastable and have the same number of elements.
For Example::
>>> torch.add(torch.ones(4,1), torch.randn(4))
would previously produce a Tensor with size: torch.Size([4,1]), but now produces a Tensor with size: torch.Size([4,4]).
In order to help identify cases in your code where backwards incompatibilities introduced by broadcasting may exist,
you may set ``torch.utils.backcompat.broadcast_warning.enabled`` to ``True``, which will generate a Python warning
in such cases.
For Example::
>>> torch.utils.backcompat.broadcast_warning.enabled=True
>>> torch.add(torch.ones(4,1), torch.ones(4))
__main__:1: UserWarning: self and other do not have the same shape, but are broadcastable, and have the same number of elements.
Changing behavior in a backwards incompatible manner to broadcasting rather than viewing as 1-dimensional.

View File

@ -0,0 +1,83 @@
.. _cuda-semantics:
CUDA semantics
==============
:mod:`torch.cuda` keeps track of currently selected GPU, and all CUDA tensors
you allocate will be created on it. The selected device can be changed with a
:any:`torch.cuda.device` context manager.
However, once a tensor is allocated, you can do operations on it irrespective
of your selected device, and the results will always be placed on the same
device as the tensor.
Cross-GPU operations are not allowed by default, with the only exception of
:meth:`~torch.Tensor.copy_`. Unless you enable peer-to-peer memory accesses,
any attempts to launch ops on tensors spread across different devices will
raise an error.
Below you can find a small example showcasing this::
x = torch.cuda.FloatTensor(1)
# x.get_device() == 0
y = torch.FloatTensor(1).cuda()
# y.get_device() == 0
with torch.cuda.device(1):
# allocates a tensor on GPU 1
a = torch.cuda.FloatTensor(1)
# transfers a tensor from CPU to GPU 1
b = torch.FloatTensor(1).cuda()
# a.get_device() == b.get_device() == 1
c = a + b
# c.get_device() == 1
z = x + y
# z.get_device() == 0
# even within a context, you can give a GPU id to the .cuda call
d = torch.randn(2).cuda(2)
# d.get_device() == 2
Best practices
--------------
Use pinned memory buffers
^^^^^^^^^^^^^^^^^^^^^^^^^
.. warning::
This is an advanced tip. Overuse of pinned memory can cause serious
problems if you are running low on RAM, and you should be aware that
pinning is often an expensive operation.
Host to GPU copies are much faster when they originate from pinned (page-locked)
memory. CPU tensors and storages expose a :meth:`~torch.Tensor.pin_memory`
method, that returns a copy of the object, with data put in a pinned region.
Also, once you pin a tensor or storage, you can use asynchronous GPU copies.
Just pass an additional ``async=True`` argument to a :meth:`~torch.Tensor.cuda`
call. This can be used to overlap data transfers with computation.
You can make the :class:`~torch.utils.data.DataLoader` return batches placed in
pinned memory by passing ``pin_memory=True`` to its constructor.
.. _cuda-nn-dataparallel-instead:
Use nn.DataParallel instead of multiprocessing
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Most use cases involving batched input and multiple GPUs should default to using
:class:`~torch.nn.DataParallel` to utilize more than one GPU. Even with the GIL,
a single python process can saturate multiple GPUs.
As of version 0.1.9, large numbers of GPUs (8+) might not be fully utilized.
However, this is a known issue that is under active development. As always,
test your use case.
There are significant caveats to using CUDA models with
:mod:`~torch.multiprocessing`; unless care is taken to meet the data handling
requirements exactly, it is likely that your program will have incorrect or
undefined behavior.

View File

@ -0,0 +1,181 @@
Extending PyTorch
=================
In this note we'll cover ways of extending :mod:`torch.nn`,
:mod:`torch.autograd`, and writing custom C extensions utilizing our C
libraries.
Extending :mod:`torch.autograd`
-------------------------------
.. currentmodule:: torch.autograd
Adding operations to :mod:`~torch.autograd` requires implementing a new
:class:`Function` subclass for each operation. Recall that :class:`Function` s
are what :mod:`~torch.autograd` uses to compute the results and gradients, and
encode the operation history. Every new function requires you to implement 2
methods:
- :meth:`~Function.forward` - the code that performs the operation. It can take
as many arguments as you want, with some of them being optional, if you
specify the default values. All kinds of Python objects are accepted here.
:class:`Variable` arguments will be converted to :class:`Tensor` s before the
call, and their use will be registered in the graph. Note that this logic won't
traverse lists/dicts/any other data structures and will only consider Variables
that are direct arguments to the call. You can return either a single
:class:`Tensor` output, or a :class:`tuple` of :class:`Tensor` s if there are
multiple outputs. Also, please refer to the docs of :class:`Function` to find
descriptions of useful methods that can be called only from :meth:`~Function.forward`.
- :meth:`~Function.backward` - gradient formula. It will be given
as many :class:`Variable` arguments as there were outputs, with each of them
representing gradient w.r.t. that output. It should return as many
:class:`Variable` s as there were inputs, with each of them containing the
gradient w.r.t. its corresponding input. If your inputs didn't require
gradient (see :attr:`~Function.needs_input_grad`), or were non-:class:`Variable`
objects, you can return :class:`python:None`. Also, if you have optional
arguments to :meth:`~Function.forward` you can return more gradients than there
were inputs, as long as they're all :any:`python:None`.
Below you can find code for a ``Linear`` function from :mod:`torch.nn`, with
additional comments::
# Inherit from Function
class Linear(Function):
# Note that both forward and backward are @staticmethods
@staticmethod
# bias is an optional argument
def forward(ctx, input, weight, bias=None):
ctx.save_for_backward(input, weight, bias)
output = input.mm(weight.t())
if bias is not None:
output += bias.unsqueeze(0).expand_as(output)
return output
# This function has only a single output, so it gets only one gradient
@staticmethod
def backward(ctx, grad_output):
# This is a pattern that is very convenient - at the top of backward
# unpack saved_tensors and initialize all gradients w.r.t. inputs to
# None. Thanks to the fact that additional trailing Nones are
# ignored, the return statement is simple even when the function has
# optional inputs.
input, weight, bias = ctx.saved_variables
grad_input = grad_weight = grad_bias = None
# These needs_input_grad checks are optional and there only to
# improve efficiency. If you want to make your code simpler, you can
# skip them. Returning gradients for inputs that don't require it is
# not an error.
if ctx.needs_input_grad[0]:
grad_input = grad_output.mm(weight)
if ctx.needs_input_grad[1]:
grad_weight = grad_output.t().mm(input)
if bias is not None and ctx.needs_input_grad[2]:
grad_bias = grad_output.sum(0).squeeze(0)
return grad_input, grad_weight, grad_bias
Now, to make it easier to use these custom ops, we recommend aliasing their
``apply`` method::
linear = Linear.apply
Here, we give an additional example of a function that is parametrized by
non-Variable arguments::
class MulConstant(Function):
@staticmethod
def forward(ctx, tensor, constant):
# ctx is a context object that can be used to stash information
# for backward computation
ctx.constant = constant
return tensor * constant
@staticmethod
def backward(ctx, grad_output):
# We return as many input gradients as there were arguments.
# Gradients of non-Tensor arguments to forward must be None.
return grad_output * ctx.constant, None
You probably want to check if the backward method you implemented actually
computes the derivatives of your function. It is possible by comparing with
numerical approximations using small finite differences::
from torch.autograd import gradcheck
# gradcheck takes a tuple of tensors as input, checks if your gradients
# evaluated with these tensors are close enough to numerical
# approximations, and returns True if they all verify this condition.
input = (Variable(torch.randn(20,20).double(), requires_grad=True), Variable(torch.randn(30,20).double(), requires_grad=True),)
test = gradcheck(Linear.apply, input, eps=1e-6, atol=1e-4)
print(test)
Extending :mod:`torch.nn`
-------------------------
.. currentmodule:: torch.nn
:mod:`~torch.nn` exports two kinds of interfaces - modules and their functional
versions. You can extend it in both ways, but we recommend using modules for
all kinds of layers, that hold any parameters or buffers, and recommend using
a functional form for parameter-less operations like activation functions, pooling,
etc.
Adding a functional version of an operation is already fully covered in the
section above.
Adding a :class:`Module`
^^^^^^^^^^^^^^^^^^^^^^^^
Since :mod:`~torch.nn` heavily utilizes :mod:`~torch.autograd`, adding a new
:class:`Module` requires implementing a :class:`~torch.autograd.Function`
that performs the operation and can compute the gradient. From now on let's
assume that we want to implement a ``Linear`` module and we have the function
implemented as in the listing above. There's very little code required to
add this. Now, there are two functions that need to be implemented:
- ``__init__`` (*optional*) - takes in arguments such as kernel sizes, numbers
of features, etc. and initializes parameters and buffers.
- :meth:`~Module.forward` - instantiates a :class:`~torch.autograd.Function` and
uses it to perform the operation. It's very similar to a functional wrapper
shown above.
This is how a ``Linear`` module can be implemented::
class Linear(nn.Module):
def __init__(self, input_features, output_features, bias=True):
self.input_features = input_features
self.output_features = output_features
# nn.Parameter is a special kind of Variable, that will get
# automatically registered as Module's parameter once it's assigned
# as an attribute. Parameters and buffers need to be registered, or
# they won't appear in .parameters() (doesn't apply to buffers), and
# won't be converted when e.g. .cuda() is called. You can use
# .register_buffer() to register buffers.
# nn.Parameters can never be volatile and, different than Variables,
# they require gradients by default.
self.weight = nn.Parameter(torch.Tensor(input_features, output_features))
if bias:
self.bias = nn.Parameter(torch.Tensor(output_features))
else:
# You should always register all possible parameters, but the
# optional ones can be None if you want.
self.register_parameter('bias', None)
# Not a very smart way to initialize weights
self.weight.data.uniform_(-0.1, 0.1)
if self.bias is not None:
self.bias.data.uniform_(-0.1, 0.1)
def forward(self, input):
# See the autograd section for explanation of what happens here.
return Linear()(input, self.weight, self.bias)
Writing custom C extensions
---------------------------
Coming soon. For now you can find an example at
`GitHub <https://github.com/pytorch/extension-ffi>`_.

View File

@ -0,0 +1,124 @@
Multiprocessing best practices
==============================
:mod:`torch.multiprocessing` is a drop in replacement for Python's
:mod:`python:multiprocessing` module. It supports the exact same operations,
but extends it, so that all tensors sent through a
:class:`python:multiprocessing.Queue`, will have their data moved into shared
memory and will only send a handle to another process.
.. note::
When a :class:`~torch.autograd.Variable` is sent to another process, both
the :attr:`Variable.data` and :attr:`Variable.grad.data` are going to be
shared.
This makes it possible to implement various training methods, like Hogwild, A3C, or any
others that require asynchronous operation.
Sharing CUDA tensors
--------------------
Sharing CUDA tensors between processes is supported only in Python 3, using
the ``spawn`` or ``forkserver`` start methods. :mod:`python:multiprocessing` in
Python 2 can only create subprocesses using ``fork``, and it's not supported
by the CUDA runtime.
.. warning::
CUDA API requires that the allocation exported to other processes remains
valid as long as it's used by them. You should be careful and ensure that
CUDA tensors you shared don't go out of scope as long as it's necessary.
This shouldn't be a problem for sharing model parameters, but passing other
kinds of data should be done with care. Note that this restriction doesn't
apply to shared CPU memory.
See also: :ref:`cuda-nn-dataparallel-instead`
Best practices and tips
-----------------------
Avoiding and fighting deadlocks
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
There are a lot of things that can go wrong when a new process is spawned, with
the most common cause of deadlocks being background threads. If there's any
thread that holds a lock or imports a module, and ``fork`` is called, it's very
likely that the subprocess will be in a corrupted state and will deadlock or
fail in a different way. Note that even if you don't, Python's built-in
libraries do - no need to look further than :mod:`python:multiprocessing`.
:class:`python:multiprocessing.Queue` is actually a very complex class, that
spawns multiple threads used to serialize, send and receive objects, and they
can cause the aforementioned problems too. If you find yourself in such a situation
try using a :class:`~python:multiprocessing.queues.SimpleQueue`, that doesn't
use any additional threads.
We're trying our best to make it easy for you and ensure these deadlocks don't
happen but some things are out of our control. If you have any issues you can't
cope with for a while, try reaching out on forums, and we'll see if it's an
issue we can fix.
Reuse buffers passed through a Queue
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Remember that each time you put a :class:`~torch.Tensor` into a
:class:`python:multiprocessing.Queue`, it has to be moved into shared memory.
If it's already shared, it is a no-op, otherwise it will incur an additional
memory copy that can slow down the whole process. Even if you have a pool of
processes sending data to a single one, make it send the buffers back - this
is nearly free and will let you avoid a copy when sending the next batch.
Asynchronous multiprocess training (e.g. Hogwild)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Using :mod:`torch.multiprocessing`, it is possible to train a model
asynchronously, with parameters either shared all the time, or being
periodically synchronized. In the first case, we recommend sending over the whole
model object, while in the latter, we advise sending only the
:meth:`~torch.nn.Module.state_dict`.
We recommend using :class:`python:multiprocessing.Queue` for passing all kinds
of PyTorch objects between processes. It is possible to e.g. inherit the tensors
and storages already in shared memory, when using the ``fork`` start method,
however it is very bug prone and should be used with care, and only by advanced
users. Queues, even though they're sometimes a less elegant solution, will work
properly in all cases.
.. warning::
You should be careful about having global statements, that are not guarded
with an ``if __name__ == '__main__'``. If a different start method than
``fork`` is used, they will be executed in all subprocesses.
Hogwild
~~~~~~~
A concrete Hogwild implementation can be found in the `examples repository`__,
but to showcase the overall structure of the code, there's also a minimal
example below::
import torch.multiprocessing as mp
from model import MyModel
def train(model):
# Construct data_loader, optimizer, etc.
for data, labels in data_loader:
optimizer.zero_grad()
loss_fn(model(data), labels).backward()
optimizer.step() # This will update the shared parameters
if __name__ == '__main__':
num_processes = 4
model = MyModel()
# NOTE: this is required for the ``fork`` method to work
model.share_memory()
processes = []
for rank in range(num_processes):
p = mp.Process(target=train, args=(model,))
p.start()
processes.append(p)
for p in processes:
p.join()
.. __: https://github.com/pytorch/examples/tree/master/mnist_hogwild

View File

@ -0,0 +1,34 @@
Serialization semantics
=======================
Best practices
--------------
.. _recommend-saving-models:
Recommended approach for saving a model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
There are two main approaches for serializing and restoring a model.
The first (recommended) saves and loads only the model parameters::
torch.save(the_model.state_dict(), PATH)
Then later::
the_model = TheModelClass(*args, **kwargs)
the_model.load_state_dict(torch.load(PATH))
The second saves and loads the entire model::
torch.save(the_model, PATH)
Then later::
the_model = torch.load(PATH)
However in this case, the serialized data is bound to the specific classes
and the exact directory structure used, so it can break in various ways when
used in other projects, or after some serious refactors.

134
docs/source/optim.rst Normal file
View File

@ -0,0 +1,134 @@
torch.optim
===================================
.. automodule:: torch.optim
How to use an optimizer
-----------------------
To use :mod:`torch.optim` you have to construct an optimizer object, that will hold
the current state and will update the parameters based on the computed gradients.
Constructing it
^^^^^^^^^^^^^^^
To construct an :class:`Optimizer` you have to give it an iterable containing the
parameters (all should be :class:`~torch.autograd.Variable` s) to optimize. Then,
you can specify optimizer-specific options such as the learning rate, weight decay, etc.
Example::
optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)
optimizer = optim.Adam([var1, var2], lr = 0.0001)
Per-parameter options
^^^^^^^^^^^^^^^^^^^^^
:class:`Optimizer` s also support specifying per-parameter options. To do this, instead
of passing an iterable of :class:`~torch.autograd.Variable` s, pass in an iterable of
:class:`dict` s. Each of them will define a separate parameter group, and should contain
a ``params`` key, containing a list of parameters belonging to it. Other keys
should match the keyword arguments accepted by the optimizers, and will be used
as optimization options for this group.
.. note::
You can still pass options as keyword arguments. They will be used as
defaults, in the groups that didn't override them. This is useful when you
only want to vary a single option, while keeping all others consistent
between parameter groups.
For example, this is very useful when one wants to specify per-layer learning rates::
optim.SGD([
{'params': model.base.parameters()},
{'params': model.classifier.parameters(), 'lr': 1e-3}
], lr=1e-2, momentum=0.9)
This means that ``model.base``'s parameters will use the default learning rate of ``1e-2``,
``model.classifier``'s parameters will use a learning rate of ``1e-3``, and a momentum of
``0.9`` will be used for all parameters.
Taking an optimization step
^^^^^^^^^^^^^^^^^^^^^^^^^^^
All optimizers implement a :func:`~Optimizer.step` method, that updates the
parameters. It can be used in two ways:
``optimizer.step()``
~~~~~~~~~~~~~~~~~~~~
This is a simplified version supported by most optimizers. The function can be
called once the gradients are computed using e.g.
:func:`~torch.autograd.Variable.backward`.
Example::
for input, target in dataset:
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
``optimizer.step(closure)``
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Some optimization algorithms such as Conjugate Gradient and LBFGS need to
reevaluate the function multiple times, so you have to pass in a closure that
allows them to recompute your model. The closure should clear the gradients,
compute the loss, and return it.
Example::
for input, target in dataset:
def closure():
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
loss.backward()
return loss
optimizer.step(closure)
Algorithms
----------
.. autoclass:: Optimizer
:members:
.. autoclass:: Adadelta
:members:
.. autoclass:: Adagrad
:members:
.. autoclass:: Adam
:members:
.. autoclass:: Adamax
:members:
.. autoclass:: ASGD
:members:
.. autoclass:: LBFGS
:members:
.. autoclass:: RMSprop
:members:
.. autoclass:: Rprop
:members:
.. autoclass:: SGD
:members:
How to adjust Learning Rate
---------------------------
:mod:`torch.optim.lr_scheduler` provides several methods to adjust the learning
rate based on the number of epochs. :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`
allows dynamic learning rate reduction based on some validation measurements.
.. autoclass:: torch.optim.lr_scheduler.LambdaLR
:members:
.. autoclass:: torch.optim.lr_scheduler.StepLR
:members:
.. autoclass:: torch.optim.lr_scheduler.MultiStepLR
:members:
.. autoclass:: torch.optim.lr_scheduler.ExponentialLR
:members:
.. autoclass:: torch.optim.lr_scheduler.ReduceLROnPlateau
:members:

114
docs/source/sparse.rst Normal file
View File

@ -0,0 +1,114 @@
.. currentmodule:: torch.sparse
torch.sparse
============
.. warning::
This API is currently experimental and may change in the near future.
Torch supports sparse tensors in COO(rdinate) format, which can
efficiently store and process tensors for which the majority of elements
are zeros.
A sparse tensor is represented as a pair of dense tensors: a tensor
of values and a tensor of indices. A sparse tensor can be constructed
by providing these two tensors, as well as the size of the sparse tensor
(which cannot be inferred from these tensors!)
>>> i = torch.LongTensor([[0, 1], [2, 0]])
>>> v = torch.FloatTensor([3, 4])
>>> torch.sparse.FloatTensor(i, v, torch.Size([2,3])).to_dense()
0 0 3
4 0 0
[torch.FloatTensor of size 2x3]
You can also construct hybrid sparse tensors, where only the first n
dimensions are sparse, and the rest of the dimensions are dense.
>>> i = torch.LongTensor([[2, 4]])
>>> v = torch.FloatTensor([[1, 3], [5, 7]])
>>> torch.sparse.FloatTensor(i, v).to_dense()
0 0
0 0
1 3
0 0
5 7
[torch.FloatTensor of size 5x2]
An empty sparse tensor can be constructed by specifying its size:
>>> torch.sparse.FloatTensor(2, 3)
SparseFloatTensor of size 2x3 with indices:
[torch.LongTensor with no dimension]
and values:
[torch.FloatTensor with no dimension]
.. note::
Our sparse tensor format permits *uncoalesced* sparse tensors, where
there may be duplicate coordinates in the indices; in this case,
the interpretation is that the value at that index is the sum of all
duplicate value entries. Uncoalesced tensors permit us to implement
certain operators more efficiently.
For the most part, you shouldn't have to care whether or not a
sparse tensor is coalesced, as most operations will work
identically given a coalesced or uncoalesced sparse tensor.
However, there are two cases in which you may need to care.
First, if you repeatedly perform an operation that can produce
duplicate entries (e.g., :func:`torch.sparse.FloatTensor.add`), you
should occasionally coalesce your sparse tensors to prevent
them from growing too large.
Second, some operators will produce different values depending on
whether or not their input is coalesced (e.g.,
:func:`torch.sparse.FloatTensor._values` and
:func:`torch.sparse.FloatTensor._indices`, as well as
:func:`torch.Tensor._sparse_mask`). These operators are
prefixed by an underscore to indicate that they reveal internal
implementation details and should be used with care, since code
that works with coalesced sparse tensors may not work with
uncoalesced sparse tensors; generally speaking, it is safest
to explicitly coalesce before working with these operators.
For example, suppose that we wanted to implement an operator
by operating directly on :func:`torch.sparse.FloatTensor._values`.
Multiplication by a scalar can be implemented in the obvious way,
as multiplication distributes over addition; however, square root
cannot be implemented directly, since ``sqrt(a + b) != sqrt(a) +
sqrt(b)`` (which is what would be computed if you were given an
uncoalesced tensor.)
.. class:: FloatTensor()
.. method:: add
.. method:: add_
.. method:: clone
.. method:: dim
.. method:: div
.. method:: div_
.. method:: get_device
.. method:: hspmm
.. method:: mm
.. method:: mul
.. method:: mul_
.. method:: resize_as_
.. method:: size
.. method:: spadd
.. method:: spmm
.. method:: sspaddmm
.. method:: sspmm
.. method:: sub
.. method:: sub_
.. method:: t_
.. method:: to_dense
.. method:: transpose
.. method:: transpose_
.. method:: zero_
.. method:: coalesce
.. method:: is_coalesced
.. method:: _indices
.. method:: _values
.. method:: _nnz

12
docs/source/storage.rst Normal file
View File

@ -0,0 +1,12 @@
torch.Storage
===================================
A :class:`torch.Storage` is a contiguous, one-dimensional array of a single
data type.
Every :class:`torch.Tensor` has a corresponding storage of the same data type.
.. autoclass:: torch.FloatStorage
:members:
:undoc-members:
:inherited-members:

309
docs/source/tensors.rst Normal file
View File

@ -0,0 +1,309 @@
.. currentmodule:: torch
torch.Tensor
===================================
A :class:`torch.Tensor` is a multi-dimensional matrix containing elements of
a single data type.
Torch defines eight CPU tensor types and eight GPU tensor types:
======================== =========================== ================================
Data type CPU tensor GPU tensor
======================== =========================== ================================
32-bit floating point :class:`torch.FloatTensor` :class:`torch.cuda.FloatTensor`
64-bit floating point :class:`torch.DoubleTensor` :class:`torch.cuda.DoubleTensor`
16-bit floating point :class:`torch.HalfTensor` :class:`torch.cuda.HalfTensor`
8-bit integer (unsigned) :class:`torch.ByteTensor` :class:`torch.cuda.ByteTensor`
8-bit integer (signed) :class:`torch.CharTensor` :class:`torch.cuda.CharTensor`
16-bit integer (signed) :class:`torch.ShortTensor` :class:`torch.cuda.ShortTensor`
32-bit integer (signed) :class:`torch.IntTensor` :class:`torch.cuda.IntTensor`
64-bit integer (signed) :class:`torch.LongTensor` :class:`torch.cuda.LongTensor`
======================== =========================== ================================
The :class:`torch.Tensor` constructor is an alias for the default tensor type
(:class:`torch.FloatTensor`).
A tensor can be constructed from a Python :class:`list` or sequence:
::
>>> torch.FloatTensor([[1, 2, 3], [4, 5, 6]])
1 2 3
4 5 6
[torch.FloatTensor of size 2x3]
An empty tensor can be constructed by specifying its size:
::
>>> torch.IntTensor(2, 4).zero_()
0 0 0 0
0 0 0 0
[torch.IntTensor of size 2x4]
The contents of a tensor can be accessed and modified using Python's indexing
and slicing notation:
::
>>> x = torch.FloatTensor([[1, 2, 3], [4, 5, 6]])
>>> print(x[1][2])
6.0
>>> x[0][1] = 8
>>> print(x)
1 8 3
4 5 6
[torch.FloatTensor of size 2x3]
Each tensor has an associated :class:`torch.Storage`, which holds its data.
The tensor class provides a multi-dimensional, `strided <https://en.wikipedia.org/wiki/Stride_of_an_array>`_
view of a storage and defines numeric operations on it.
.. note::
Methods which mutate a tensor are marked with an underscore suffix.
For example, :func:`torch.FloatTensor.abs_` computes the absolute value
in-place and returns the modified tensor, while :func:`torch.FloatTensor.abs`
computes the result in a new tensor.
.. class:: Tensor()
Tensor(*sizes)
Tensor(size)
Tensor(sequence)
Tensor(ndarray)
Tensor(tensor)
Tensor(storage)
Creates a new tensor from an optional size or data.
If no arguments are given, an empty zero-dimensional tensor is returned.
If a :class:`numpy.ndarray`, :class:`torch.Tensor`, or :class:`torch.Storage`
is given, a new tensor that shares the same data is returned. If a Python
sequence is given, a new tensor is created from a copy of the sequence.
.. automethod:: abs
.. automethod:: abs_
.. automethod:: acos
.. automethod:: acos_
.. automethod:: add
.. automethod:: add_
.. automethod:: addbmm
.. automethod:: addbmm_
.. automethod:: addcdiv
.. automethod:: addcdiv_
.. automethod:: addcmul
.. automethod:: addcmul_
.. automethod:: addmm
.. automethod:: addmm_
.. automethod:: addmv
.. automethod:: addmv_
.. automethod:: addr
.. automethod:: addr_
.. automethod:: apply_
.. automethod:: asin
.. automethod:: asin_
.. automethod:: atan
.. automethod:: atan2
.. automethod:: atan2_
.. automethod:: atan_
.. automethod:: baddbmm
.. automethod:: baddbmm_
.. automethod:: bernoulli
.. automethod:: bernoulli_
.. automethod:: bmm
.. automethod:: byte
.. automethod:: cauchy_
.. automethod:: ceil
.. automethod:: ceil_
.. automethod:: char
.. automethod:: chunk
.. automethod:: clamp
.. automethod:: clamp_
.. automethod:: clone
.. automethod:: contiguous
.. automethod:: copy_
.. automethod:: cos
.. automethod:: cos_
.. automethod:: cosh
.. automethod:: cosh_
.. automethod:: cpu
.. automethod:: cross
.. automethod:: cuda
.. automethod:: cumprod
.. automethod:: cumsum
.. automethod:: data_ptr
.. automethod:: diag
.. automethod:: dim
.. automethod:: dist
.. automethod:: div
.. automethod:: div_
.. automethod:: dot
.. automethod:: double
.. automethod:: eig
.. automethod:: element_size
.. automethod:: eq
.. automethod:: eq_
.. automethod:: equal
.. automethod:: exp
.. automethod:: exp_
.. automethod:: expand
.. automethod:: expand_as
.. automethod:: exponential_
.. automethod:: fill_
.. automethod:: float
.. automethod:: floor
.. automethod:: floor_
.. automethod:: fmod
.. automethod:: fmod_
.. automethod:: frac
.. automethod:: frac_
.. automethod:: gather
.. automethod:: ge
.. automethod:: ge_
.. automethod:: gels
.. automethod:: geometric_
.. automethod:: geqrf
.. automethod:: ger
.. automethod:: gesv
.. automethod:: gt
.. automethod:: gt_
.. automethod:: half
.. automethod:: histc
.. automethod:: index
.. automethod:: index_add_
.. automethod:: index_copy_
.. automethod:: index_fill_
.. automethod:: index_select
.. automethod:: int
.. automethod:: inverse
.. automethod:: is_contiguous
.. autoattribute:: is_cuda
:annotation:
.. automethod:: is_pinned
.. automethod:: is_set_to
.. automethod:: is_signed
.. automethod:: kthvalue
.. automethod:: le
.. automethod:: le_
.. automethod:: lerp
.. automethod:: lerp_
.. automethod:: log
.. automethod:: log1p
.. automethod:: log1p_
.. automethod:: log_
.. automethod:: log_normal_
.. automethod:: long
.. automethod:: lt
.. automethod:: lt_
.. automethod:: map_
.. automethod:: masked_scatter_
.. automethod:: masked_fill_
.. automethod:: masked_select
.. automethod:: matmul
.. automethod:: max
.. automethod:: mean
.. automethod:: median
.. automethod:: min
.. automethod:: mm
.. automethod:: mode
.. automethod:: mul
.. automethod:: mul_
.. automethod:: multinomial
.. automethod:: mv
.. automethod:: narrow
.. automethod:: ndimension
.. automethod:: ne
.. automethod:: ne_
.. automethod:: neg
.. automethod:: neg_
.. automethod:: nelement
.. automethod:: new
.. automethod:: nonzero
.. automethod:: norm
.. automethod:: normal_
.. automethod:: numel
.. automethod:: numpy
.. automethod:: orgqr
.. automethod:: ormqr
.. automethod:: permute
.. automethod:: pin_memory
.. automethod:: potrf
.. automethod:: potri
.. automethod:: potrs
.. automethod:: pow
.. automethod:: pow_
.. automethod:: prod
.. automethod:: pstrf
.. automethod:: qr
.. automethod:: random_
.. automethod:: reciprocal
.. automethod:: reciprocal_
.. automethod:: remainder
.. automethod:: remainder_
.. automethod:: renorm
.. automethod:: renorm_
.. automethod:: repeat
.. automethod:: resize_
.. automethod:: resize_as_
.. automethod:: round
.. automethod:: round_
.. automethod:: rsqrt
.. automethod:: rsqrt_
.. automethod:: scatter_
.. automethod:: select
.. automethod:: set_
.. automethod:: share_memory_
.. automethod:: short
.. automethod:: sigmoid
.. automethod:: sigmoid_
.. automethod:: sign
.. automethod:: sign_
.. automethod:: sin
.. automethod:: sin_
.. automethod:: sinh
.. automethod:: sinh_
.. automethod:: size
.. automethod:: sort
.. automethod:: split
.. automethod:: sqrt
.. automethod:: sqrt_
.. automethod:: squeeze
.. automethod:: squeeze_
.. automethod:: std
.. automethod:: storage
.. automethod:: storage_offset
.. automethod:: storage_type
.. automethod:: stride
.. automethod:: sub
.. automethod:: sub_
.. automethod:: sum
.. automethod:: svd
.. automethod:: symeig
.. automethod:: t
.. automethod:: t_
.. automethod:: tan
.. automethod:: tan_
.. automethod:: tanh
.. automethod:: tanh_
.. automethod:: tolist
.. automethod:: topk
.. automethod:: trace
.. automethod:: transpose
.. automethod:: transpose_
.. automethod:: tril
.. automethod:: tril_
.. automethod:: triu
.. automethod:: triu_
.. automethod:: trtrs
.. automethod:: trunc
.. automethod:: trunc_
.. automethod:: type
.. automethod:: type_as
.. automethod:: unfold
.. automethod:: uniform_
.. automethod:: unsqueeze
.. automethod:: unsqueeze_
.. automethod:: var
.. automethod:: view
.. automethod:: view_as
.. automethod:: zero_

186
docs/source/torch.rst Normal file
View File

@ -0,0 +1,186 @@
torch
===================================
.. automodule:: torch
Tensors
----------------------------------
.. autofunction:: is_tensor
.. autofunction:: is_storage
.. autofunction:: set_default_tensor_type
.. autofunction:: numel
.. autofunction:: set_printoptions
Creation Ops
~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: eye
.. autofunction:: from_numpy
.. autofunction:: linspace
.. autofunction:: logspace
.. autofunction:: ones
.. autofunction:: rand
.. autofunction:: randn
.. autofunction:: randperm
.. autofunction:: arange
.. autofunction:: range
.. autofunction:: zeros
Indexing, Slicing, Joining, Mutating Ops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: cat
.. autofunction:: chunk
.. autofunction:: gather
.. autofunction:: index_select
.. autofunction:: masked_select
.. autofunction:: nonzero
.. autofunction:: split
.. autofunction:: squeeze
.. autofunction:: stack
.. autofunction:: t
.. autofunction:: transpose
.. autofunction:: unbind
.. autofunction:: unsqueeze
Random sampling
----------------------------------
.. autofunction:: manual_seed
.. autofunction:: initial_seed
.. autofunction:: get_rng_state
.. autofunction:: set_rng_state
.. autodata:: default_generator
.. autofunction:: bernoulli
.. autofunction:: multinomial
.. autofunction:: normal
Serialization
----------------------------------
.. autofunction:: save
.. autofunction:: load
Parallelism
----------------------------------
.. autofunction:: get_num_threads
.. autofunction:: set_num_threads
Math operations
----------------------------------
Pointwise Ops
~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: abs
.. autofunction:: acos
.. autofunction:: add
.. autofunction:: addcdiv
.. autofunction:: addcmul
.. autofunction:: asin
.. autofunction:: atan
.. autofunction:: atan2
.. autofunction:: ceil
.. autofunction:: clamp
.. autofunction:: cos
.. autofunction:: cosh
.. autofunction:: div
.. autofunction:: exp
.. autofunction:: floor
.. autofunction:: fmod
.. autofunction:: frac
.. autofunction:: lerp
.. autofunction:: log
.. autofunction:: log1p
.. autofunction:: mul
.. autofunction:: neg
.. autofunction:: pow
.. autofunction:: reciprocal
.. autofunction:: remainder
.. autofunction:: round
.. autofunction:: rsqrt
.. autofunction:: sigmoid
.. autofunction:: sign
.. autofunction:: sin
.. autofunction:: sinh
.. autofunction:: sqrt
.. autofunction:: tan
.. autofunction:: tanh
.. autofunction:: trunc
Reduction Ops
~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: cumprod
.. autofunction:: cumsum
.. autofunction:: dist
.. autofunction:: mean
.. autofunction:: median
.. autofunction:: mode
.. autofunction:: norm
.. autofunction:: prod
.. autofunction:: std
.. autofunction:: sum
.. autofunction:: var
Comparison Ops
~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: eq
.. autofunction:: equal
.. autofunction:: ge
.. autofunction:: gt
.. autofunction:: kthvalue
.. autofunction:: le
.. autofunction:: lt
.. autofunction:: max
.. autofunction:: min
.. autofunction:: ne
.. autofunction:: sort
.. autofunction:: topk
Other Operations
~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: cross
.. autofunction:: diag
.. autofunction:: histc
.. autofunction:: renorm
.. autofunction:: trace
.. autofunction:: tril
.. autofunction:: triu
BLAS and LAPACK Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: addbmm
.. autofunction:: addmm
.. autofunction:: addmv
.. autofunction:: addr
.. autofunction:: baddbmm
.. autofunction:: bmm
.. autofunction:: btrifact
.. autofunction:: btrisolve
.. autofunction:: dot
.. autofunction:: eig
.. autofunction:: gels
.. autofunction:: geqrf
.. autofunction:: ger
.. autofunction:: gesv
.. autofunction:: inverse
.. autofunction:: matmul
.. autofunction:: mm
.. autofunction:: mv
.. autofunction:: orgqr
.. autofunction:: ormqr
.. autofunction:: potrf
.. autofunction:: potri
.. autofunction:: potrs
.. autofunction:: pstrf
.. autofunction:: qr
.. autofunction:: svd
.. autofunction:: symeig
.. autofunction:: trtrs

View File

@ -0,0 +1,112 @@
torchvision.datasets
====================
All datasets are subclasses of :class:`torch.utils.data.Dataset`,
i.e., they have ``__getitem__`` and ``__len__`` methods implemented.
Hence, they can all be passed to a :class:`torch.utils.data.DataLoader`,
which can load multiple samples in parallel using ``torch.multiprocessing`` workers.
For example: ::
imagenet_data = torchvision.datasets.ImageFolder('path/to/imagenet_root/')
data_loader = torch.utils.data.DataLoader(imagenet_data,
batch_size=4,
shuffle=True,
num_workers=args.nThreads)
The following datasets are available:
.. contents:: Datasets
:local:
All the datasets have a very similar API. They all have two common arguments:
``transform`` and ``target_transform`` to transform the input and target respectively.
.. currentmodule:: torchvision.datasets
MNIST
~~~~~
.. autoclass:: MNIST
COCO
~~~~
.. note ::
These require the `COCO API to be installed`_
.. _COCO API to be installed: https://github.com/pdollar/coco/tree/master/PythonAPI
Captions
^^^^^^^^
.. autoclass:: CocoCaptions
:members: __getitem__
:special-members:
Detection
^^^^^^^^^
.. autoclass:: CocoDetection
:members: __getitem__
:special-members:
LSUN
~~~~
.. autoclass:: LSUN
:members: __getitem__
:special-members:
ImageFolder
~~~~~~~~~~~
.. autoclass:: ImageFolder
:members: __getitem__
:special-members:
Imagenet-12
~~~~~~~~~~~
This should simply be implemented with an ``ImageFolder`` dataset.
The data is preprocessed `as described
here <https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md#download-the-imagenet-dataset>`__.
`Here is an
example <https://github.com/pytorch/examples/blob/27e2a46c1d1505324032b1d94fc6ce24d5b67e97/imagenet/main.py#L48-L62>`__.
CIFAR
~~~~~
.. autoclass:: CIFAR10
:members: __getitem__
:special-members:
STL10
~~~~~
.. autoclass:: STL10
:members: __getitem__
:special-members:
SVHN
~~~~~
.. autoclass:: SVHN
:members: __getitem__
:special-members:
PhotoTour
~~~~~~~~~
.. autoclass:: PhotoTour
:members: __getitem__
:special-members:

View File

@ -0,0 +1,12 @@
torchvision.models
===================
.. currentmodule:: torchvision.models
.. automodule:: torchvision.models
:members: alexnet, resnet18, resnet34, resnet50, resnet101, resnet152,
vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19,
vgg19_bn, inception_v3, squeezenet1_0, squeezenet1_1, densenet121,
densenet169, densenet201, densenet161
:undoc-members:

View File

@ -0,0 +1,8 @@
torchvision
===================
The :mod:`torchvision` package consists of popular datasets, model
architectures, and common image transformations for computer vision.
.. automodule:: torchvision
:members:

View File

@ -0,0 +1,48 @@
torchvision.transforms
======================
.. currentmodule:: torchvision.transforms
Transforms are common image transforms. They can be chained together using :class:`Compose`.
.. autoclass:: Compose
Transforms on PIL.Image
-----------------------
.. autoclass:: Scale
.. autoclass:: CenterCrop
.. autoclass:: RandomCrop
.. autoclass:: RandomHorizontalFlip
.. autoclass:: RandomSizedCrop
.. autoclass:: Pad
Transforms on torch.\*Tensor
----------------------------
.. autoclass:: Normalize
:members: __call__
:special-members:
Conversion Transforms
---------------------
.. autoclass:: ToTensor
:members: __call__
:special-members:
.. autoclass:: ToPILImage
:members: __call__
:special-members:
Generic Transforms
------------------
.. autoclass:: Lambda

View File

@ -0,0 +1,9 @@
torchvision.utils
===================
.. currentmodule:: torchvision.utils
.. autofunction:: make_grid
.. autofunction:: save_image

394
setup.py
View File

@ -1,6 +1,9 @@
from setuptools import setup, Extension, distutils, Command, find_packages
import setuptools.command.build_ext
import setuptools.command.install
import setuptools.command.develop
import setuptools.command.build_py
import distutils.unixccompiler
import distutils.command.build
import distutils.command.clean
import platform
@ -9,21 +12,44 @@ import shutil
import sys
import os
# TODO: make this more robust
WITH_CUDA = os.path.exists('/Developer/NVIDIA/CUDA-7.5/include') or os.path.exists('/usr/local/cuda/include')
DEBUG = False
from tools.setup_helpers.env import check_env_flag
from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME
from tools.setup_helpers.cudnn import WITH_CUDNN, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR
from tools.setup_helpers.split_types import split_types
DEBUG = check_env_flag('DEBUG')
WITH_DISTRIBUTED = not check_env_flag('NO_DISTRIBUTED')
WITH_DISTRIBUTED_MW = WITH_DISTRIBUTED and check_env_flag('WITH_DISTRIBUTED_MW')
WITH_NCCL = WITH_CUDA and platform.system() != 'Darwin'
SYSTEM_NCCL = False
################################################################################
# Workaround setuptools -Wstrict-prototypes warnings
# I lifted this code from https://stackoverflow.com/a/29634231/23845
################################################################################
import distutils.sysconfig
cfg_vars = distutils.sysconfig.get_config_vars()
for key, value in cfg_vars.items():
if type(value) == str:
cfg_vars[key] = value.replace("-Wstrict-prototypes", "")
################################################################################
# Monkey-patch setuptools to compile in parallel
################################################################################
original_link = distutils.unixccompiler.UnixCCompiler.link
def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None):
def parallelCCompile(self, sources, output_dir=None, macros=None,
include_dirs=None, debug=0, extra_preargs=None,
extra_postargs=None, depends=None):
# those lines are copied from distutils.ccompiler.CCompiler directly
macros, objects, extra_postargs, pp_opts, build = self._setup_compile(output_dir, macros, include_dirs, sources, depends, extra_postargs)
macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
output_dir, macros, include_dirs, sources, depends, extra_postargs)
cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
# compile using a thread pool
import multiprocessing.pool
def _single_compile(obj):
src, ext = build[obj]
self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
@ -32,12 +58,23 @@ def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=N
return objects
def patched_link(self, *args, **kwargs):
    """Run the original linker with ``compiler_cxx`` temporarily cleared.

    Delegates to ``original_link`` (the saved ``UnixCCompiler.link``) while
    ``self.compiler_cxx`` is set to ``None``, then puts the previous value
    back. All positional and keyword arguments are forwarded unchanged.

    Returns:
        Whatever ``original_link`` returns.
    """
    # NOTE(review): presumably clearing compiler_cxx keeps distutils from
    # swapping the C++ driver into the link command -- confirm.
    saved_cxx = self.compiler_cxx
    self.compiler_cxx = None
    try:
        return original_link(self, *args, **kwargs)
    finally:
        # Restore on every exit path so a failed link does not leave the
        # compiler object without its C++ driver for later extensions.
        self.compiler_cxx = saved_cxx
distutils.ccompiler.CCompiler.compile = parallelCCompile
distutils.unixccompiler.UnixCCompiler.link = patched_link
################################################################################
# Custom build commands
################################################################################
class build_deps(Command):
user_options = []
@ -52,6 +89,10 @@ class build_deps(Command):
build_all_cmd = ['bash', 'torch/lib/build_all.sh']
if WITH_CUDA:
build_all_cmd += ['--with-cuda']
if WITH_NCCL and not SYSTEM_NCCL:
build_all_cmd += ['--with-nccl']
if WITH_DISTRIBUTED:
build_all_cmd += ['--with-distributed']
if subprocess.call(build_all_cmd) != 0:
sys.exit(1)
generate_nn_wrappers()
@ -71,22 +112,81 @@ class build_module(Command):
self.run_command('build_ext')
class build_ext(setuptools.command.build_ext.build_ext):
class build_py(setuptools.command.build_py.build_py):
def run(self):
self.create_version_file()
setuptools.command.build_py.build_py.run(self)
@staticmethod
def create_version_file():
global version, cwd
print('-- Building version ' + version)
version_path = os.path.join(cwd, 'torch', 'version.py')
with open(version_path, 'w') as f:
f.write("__version__ = '{}'\n".format(version))
class develop(setuptools.command.develop.develop):
def run(self):
build_py.create_version_file()
setuptools.command.develop.develop.run(self)
class build_ext(setuptools.command.build_ext.build_ext):
def run(self):
# Print build options
if WITH_NUMPY:
print('-- Building with NumPy bindings')
else:
print('-- NumPy not found')
if WITH_CUDNN:
print('-- Detected cuDNN at ' + CUDNN_LIB_DIR + ', ' + CUDNN_INCLUDE_DIR)
else:
print('-- Not using cuDNN')
if WITH_CUDA:
print('-- Detected CUDA at ' + CUDA_HOME)
else:
print('-- Not using CUDA')
if WITH_NCCL and SYSTEM_NCCL:
print('-- Using system provided NCCL library')
elif WITH_NCCL:
print('-- Building NCCL library')
else:
print('-- Not using NCCL')
if WITH_DISTRIBUTED:
print('-- Building with distributed package ')
else:
print('-- Building without distributed package')
# cwrap depends on pyyaml, so we can't import it earlier
from tools.cwrap import cwrap
from tools.cwrap.plugins.THPPlugin import THPPlugin
from tools.cwrap.plugins.THPLongArgsPlugin import THPLongArgsPlugin
from tools.cwrap.plugins.ArgcountSortPlugin import ArgcountSortPlugin
from tools.cwrap.plugins.AutoGPU import AutoGPU
from tools.cwrap.plugins.BoolOption import BoolOption
from tools.cwrap.plugins.KwargsPlugin import KwargsPlugin
from tools.cwrap.plugins.NullableArguments import NullableArguments
from tools.cwrap.plugins.CuDNNPlugin import CuDNNPlugin
from tools.cwrap.plugins.WrapDim import WrapDim
from tools.cwrap.plugins.AssertNDim import AssertNDim
from tools.cwrap.plugins.Broadcast import Broadcast
from tools.cwrap.plugins.ProcessorSpecificPlugin import ProcessorSpecificPlugin
thp_plugin = THPPlugin()
cwrap('torch/csrc/generic/TensorMethods.cwrap', plugins=[
THPLongArgsPlugin(), THPPlugin(), ArgcountSortPlugin(), AutoGPU()
ProcessorSpecificPlugin(), BoolOption(), thp_plugin,
AutoGPU(condition='IS_CUDA'), ArgcountSortPlugin(), KwargsPlugin(),
AssertNDim(), WrapDim(), Broadcast()
])
cwrap('torch/csrc/cudnn/cuDNN.cwrap', plugins=[
CuDNNPlugin(), NullableArguments()
])
# It's an old-style class in Python 2.7...
setuptools.command.build_ext.build_ext.run(self)
class build(distutils.command.build.build):
sub_commands = [
('build_deps', lambda self: True),
@ -94,6 +194,7 @@ class build(distutils.command.build.build):
class install(setuptools.command.install.install):
def run(self):
if not self.skip_build:
self.run_command('build_deps')
@ -101,27 +202,33 @@ class install(setuptools.command.install.install):
class clean(distutils.command.clean.clean):
def run(self):
import glob
with open('.gitignore', 'r') as f:
ignores = f.read()
for glob in filter(bool, ignores.split('\n')):
shutil.rmtree(glob, ignore_errors=True)
for wildcard in filter(bool, ignores.split('\n')):
for filename in glob.glob(wildcard):
try:
os.remove(filename)
except OSError:
shutil.rmtree(filename, ignore_errors=True)
# It's an old-style class in Python 2.7...
distutils.command.clean.clean.run(self)
################################################################################
# Configure compile flags
################################################################################
include_dirs = []
library_dirs = []
extra_link_args = []
extra_compile_args = ['-std=c++11', '-Wno-write-strings']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
print('PYTORCH_BINARY_BUILD found. Static linking libstdc++ on Linux')
extra_compile_args += ['-static-libstdc++']
extra_link_args += ['-static-libstdc++']
extra_compile_args = ['-std=c++11', '-Wno-write-strings',
# Python 2.6 requires -fno-strict-aliasing, see
# http://legacy.python.org/dev/peps/pep-3123/
'-fno-strict-aliasing']
cwd = os.path.dirname(os.path.abspath(__file__))
lib_path = os.path.join(cwd, "torch", "lib")
@ -132,55 +239,170 @@ include_dirs += [
os.path.join(cwd, "torch", "csrc"),
tmp_install_path + "/include",
tmp_install_path + "/include/TH",
tmp_install_path + "/include/THPP",
tmp_install_path + "/include/THNN",
tmp_install_path + "/include/ATen",
]
extra_link_args.append('-L' + lib_path)
library_dirs.append(lib_path)
main_libraries = ['TH', 'shm']
# we specify exact lib names to avoid conflict with lua-torch installs
TH_LIB = os.path.join(lib_path, 'libTH.so.1')
THS_LIB = os.path.join(lib_path, 'libTHS.so.1')
THC_LIB = os.path.join(lib_path, 'libTHC.so.1')
THCS_LIB = os.path.join(lib_path, 'libTHCS.so.1')
THNN_LIB = os.path.join(lib_path, 'libTHNN.so.1')
THCUNN_LIB = os.path.join(lib_path, 'libTHCUNN.so.1')
THPP_LIB = os.path.join(lib_path, 'libTHPP.so.1')
ATEN_LIB = os.path.join(lib_path, 'libATen.so.1')
GLOO_LIB = os.path.join(lib_path, 'libgloo.a')
GLOO_CUDA_LIB = os.path.join(lib_path, 'libgloo_cuda.a')
THD_LIB = os.path.join(lib_path, 'libTHD.a')
NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1')
if platform.system() == 'Darwin':
TH_LIB = os.path.join(lib_path, 'libTH.1.dylib')
THS_LIB = os.path.join(lib_path, 'libTHS.1.dylib')
THC_LIB = os.path.join(lib_path, 'libTHC.1.dylib')
THCS_LIB = os.path.join(lib_path, 'libTHCS.1.dylib')
THNN_LIB = os.path.join(lib_path, 'libTHNN.1.dylib')
THCUNN_LIB = os.path.join(lib_path, 'libTHCUNN.1.dylib')
THPP_LIB = os.path.join(lib_path, 'libTHPP.1.dylib')
ATEN_LIB = os.path.join(lib_path, 'libATen.1.dylib')
NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib')
if WITH_NCCL and subprocess.call('ldconfig -p | grep libnccl >/dev/null', shell=True) == 0:
SYSTEM_NCCL = True
main_compile_args = ['-D_THP_CORE']
main_libraries = ['shm']
main_link_args = [TH_LIB, THS_LIB, THPP_LIB, THNN_LIB, ATEN_LIB]
main_sources = [
"torch/csrc/PtrWrapper.cpp",
"torch/csrc/Module.cpp",
"torch/csrc/Generator.cpp",
"torch/csrc/Tensor.cpp",
"torch/csrc/Size.cpp",
"torch/csrc/Exceptions.cpp",
"torch/csrc/Storage.cpp",
"torch/csrc/DynamicTypes.cpp",
"torch/csrc/byte_order.cpp",
"torch/csrc/utils.cpp",
"torch/csrc/expand_utils.cpp",
"torch/csrc/utils/object_ptr.cpp",
"torch/csrc/utils/tuple_parser.cpp",
"torch/csrc/allocators.cpp",
"torch/csrc/serialization.cpp",
"torch/csrc/autograd/init.cpp",
"torch/csrc/autograd/engine.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/variable.cpp",
"torch/csrc/autograd/input_buffer.cpp",
"torch/csrc/autograd/python_function.cpp",
"torch/csrc/autograd/python_cpp_function.cpp",
"torch/csrc/autograd/python_variable.cpp",
"torch/csrc/autograd/python_engine.cpp",
"torch/csrc/autograd/python_hook.cpp",
"torch/csrc/autograd/functions/batch_normalization.cpp",
"torch/csrc/autograd/functions/convolution.cpp",
"torch/csrc/autograd/functions/basic_ops.cpp",
"torch/csrc/autograd/functions/tensor.cpp",
"torch/csrc/autograd/functions/accumulate_grad.cpp",
"torch/csrc/autograd/functions/utils.cpp",
"torch/csrc/autograd/functions/init.cpp",
"torch/csrc/nn/THNN_generic.cpp",
]
main_sources += split_types("torch/csrc/Tensor.cpp")
try:
import numpy as np
include_dirs += [np.get_include()]
extra_compile_args += ['-DWITH_NUMPY']
WITH_NUMPY = True
except ImportError:
pass
WITH_NUMPY = False
if WITH_DISTRIBUTED:
extra_compile_args += ['-DWITH_DISTRIBUTED']
main_sources += [
"torch/csrc/distributed/Module.cpp",
"torch/csrc/distributed/utils.cpp",
]
if WITH_DISTRIBUTED_MW:
main_sources += [
"torch/csrc/distributed/Tensor.cpp",
"torch/csrc/distributed/Storage.cpp",
]
extra_compile_args += ['-DWITH_DISTRIBUTED_MW']
include_dirs += [tmp_install_path + "/include/THD"]
main_link_args += [THD_LIB]
if platform.system() == 'Linux':
main_link_args += [GLOO_LIB]
if WITH_CUDA:
if platform.system() == 'Darwin':
cuda_path = '/Developer/NVIDIA/CUDA-7.5'
cuda_include_path = cuda_path + '/include'
cuda_lib_path = cuda_path + '/lib'
else:
cuda_path = '/usr/local/cuda'
cuda_include_path = cuda_path + '/include'
cuda_lib_path = cuda_path + '/lib64'
cuda_lib_dirs = ['lib64', 'lib']
cuda_include_path = os.path.join(CUDA_HOME, 'include')
for lib_dir in cuda_lib_dirs:
cuda_lib_path = os.path.join(CUDA_HOME, lib_dir)
if os.path.exists(cuda_lib_path):
break
include_dirs.append(cuda_include_path)
extra_link_args.append('-L' + cuda_lib_path)
include_dirs.append(tmp_install_path + "/include/THCUNN")
library_dirs.append(cuda_lib_path)
extra_link_args.append('-Wl,-rpath,' + cuda_lib_path)
extra_compile_args += ['-DWITH_CUDA']
main_libraries += ['THC']
extra_compile_args += ['-DCUDA_LIB_PATH=' + cuda_lib_path]
main_libraries += ['cudart', 'nvToolsExt']
main_link_args += [THC_LIB, THCS_LIB, THCUNN_LIB]
if platform.system() == 'Linux':
main_link_args += [GLOO_CUDA_LIB]
main_sources += [
"torch/csrc/cuda/Module.cpp",
"torch/csrc/cuda/Storage.cpp",
"torch/csrc/cuda/Tensor.cpp",
"torch/csrc/cuda/Stream.cpp",
"torch/csrc/cuda/AutoGPU.cpp",
"torch/csrc/cuda/utils.cpp",
"torch/csrc/cuda/expand_utils.cpp",
"torch/csrc/cuda/serialization.cpp",
]
main_sources += split_types("torch/csrc/cuda/Tensor.cpp")
if WITH_NCCL:
if SYSTEM_NCCL:
main_libraries += ['nccl']
else:
main_link_args += [NCCL_LIB]
extra_compile_args += ['-DWITH_NCCL']
if WITH_CUDNN:
main_libraries += ['cudnn']
include_dirs.append(CUDNN_INCLUDE_DIR)
library_dirs.append(CUDNN_LIB_DIR)
main_sources += [
"torch/csrc/cudnn/BatchNorm.cpp",
"torch/csrc/cudnn/Conv.cpp",
"torch/csrc/cudnn/cuDNN.cpp",
"torch/csrc/cudnn/GridSampler.cpp",
"torch/csrc/cudnn/AffineGridGenerator.cpp",
"torch/csrc/cudnn/Types.cpp",
"torch/csrc/cudnn/Handles.cpp",
]
extra_compile_args += ['-DWITH_CUDNN']
if DEBUG:
extra_compile_args += ['-O0', '-g']
extra_link_args += ['-O0', '-g']
if os.getenv('PYTORCH_BINARY_BUILD') and platform.system() == 'Linux':
print('PYTORCH_BINARY_BUILD found. Static linking libstdc++ on Linux')
# get path of libstdc++ and link manually.
# for reasons unknown, -static-libstdc++ doesn't fully link some symbols
CXXNAME = os.getenv('CXX', 'g++')
STDCPP_LIB = subprocess.check_output([CXXNAME, '-print-file-name=libstdc++.a'])
STDCPP_LIB = STDCPP_LIB[:-1]
if type(STDCPP_LIB) != str: # python 3
STDCPP_LIB = STDCPP_LIB.decode(sys.stdout.encoding)
main_link_args += [STDCPP_LIB]
version_script = os.path.abspath("tools/pytorch.version")
extra_link_args += ['-Wl,--version-script=' + version_script]
def make_relative_rpath(path):
if platform.system() == 'Darwin':
@ -193,51 +415,85 @@ def make_relative_rpath(path):
################################################################################
extensions = []
packages = find_packages(exclude=('tools.*', 'torch.cuda', 'torch.legacy.cunn'))
packages = find_packages(exclude=('tools', 'tools.*',))
C = Extension("torch._C",
libraries=main_libraries,
sources=main_sources,
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [make_relative_rpath('lib')]
)
libraries=main_libraries,
sources=main_sources,
language='c++',
extra_compile_args=main_compile_args + extra_compile_args,
include_dirs=include_dirs,
library_dirs=library_dirs,
extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('lib')],
)
extensions.append(C)
DL = Extension("torch._dl",
sources=["torch/csrc/dl.c"],
language='c',
)
extensions.append(DL)
THNN = Extension("torch._thnn._THNN",
libraries=['TH', 'THNN'],
sources=['torch/csrc/nn/THNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [make_relative_rpath('../lib')]
)
sources=['torch/csrc/nn/THNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [
TH_LIB,
THNN_LIB,
make_relative_rpath('../lib'),
]
)
extensions.append(THNN)
if WITH_CUDA:
THCUNN = Extension("torch._thnn._THCUNN",
libraries=['TH', 'THC', 'THCUNN'],
sources=['torch/csrc/nn/THCUNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [make_relative_rpath('../lib')]
)
sources=['torch/csrc/nn/THCUNN.cpp'],
language='c++',
extra_compile_args=extra_compile_args,
include_dirs=include_dirs,
extra_link_args=extra_link_args + [
TH_LIB,
THC_LIB,
THCUNN_LIB,
make_relative_rpath('../lib'),
]
)
extensions.append(THCUNN)
packages += ['torch.cuda', 'torch.legacy.cunn']
setup(name="torch", version="0.1",
ext_modules=extensions,
cmdclass = {
'build': build,
'build_ext': build_ext,
'build_deps': build_deps,
'build_module': build_module,
'install': install,
'clean': clean,
},
packages=packages,
package_data={'torch': ['lib/*.so*', 'lib/*.dylib*', 'lib/*.h', 'lib/torch_shm_manager']},
install_requires=['pyyaml'],
)
# Base release version; refined below from the environment or from git.
version = '0.2.0'
if os.getenv('PYTORCH_BUILD_VERSION'):
    # An explicit build version must come together with a build number.
    assert os.getenv('PYTORCH_BUILD_NUMBER') is not None
    version = os.getenv('PYTORCH_BUILD_VERSION') \
        + '_' + os.getenv('PYTORCH_BUILD_NUMBER')
else:
    try:
        sha = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
        version += '+' + sha[:7]
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: git ran but failed (e.g. not a checkout).
        # OSError: the git executable could not be started at all.
        # Either way, fall back to the plain release version.
        pass
setup(name="torch", version=version,
description="Tensors and Dynamic neural networks in Python with strong GPU acceleration",
ext_modules=extensions,
cmdclass={
'build': build,
'build_py': build_py,
'build_ext': build_ext,
'build_deps': build_deps,
'build_module': build_module,
'develop': develop,
'install': install,
'clean': clean,
},
packages=packages,
package_data={'torch': [
'lib/*.so*', 'lib/*.dylib*',
'lib/torch_shm_manager',
'lib/*.h',
'lib/include/TH/*.h', 'lib/include/TH/generic/*.h',
'lib/include/THC/*.h', 'lib/include/THC/generic/*.h']},
install_requires=['pyyaml', 'numpy'],
)

View File

@ -1,10 +1,77 @@
import sys
import os
import argparse
import unittest
import warnings
import contextlib
from functools import wraps
from itertools import product
from copy import deepcopy
import torch
import torch.cuda
from torch.autograd import Variable
from torch.autograd.leaf import Leaf
torch.set_default_tensor_type('torch.DoubleTensor')
SEED = 0
SEED_SET = 0
def parse_set_seed_once():
    """Parse ``--seed`` from the command line and seed torch on first use.

    The seed (default 123) is applied with ``torch.manual_seed`` and, when
    CUDA is available, ``torch.cuda.manual_seed_all``. This happens only
    while the module-level ``SEED_SET`` flag is still 0; the chosen value is
    then stored in ``SEED`` and the flag is set to 1.

    Returns:
        An argv-style list: ``sys.argv[0]`` followed by every argument the
        seed parser did not recognise.
    """
    global SEED
    global SEED_SET
    seed_parser = argparse.ArgumentParser(add_help=False)
    seed_parser.add_argument('--seed', type=int, default=123)
    known, leftover = seed_parser.parse_known_args()
    if SEED_SET == 0:
        seed = known.seed
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
        SEED = seed
        SEED_SET = 1
    return [sys.argv[0]] + leftover
def run_tests():
    """Seed the RNGs from ``--seed`` and hand the remaining argv to unittest."""
    unittest.main(argv=parse_set_seed_once())
TEST_NUMPY = True
try:
import numpy
except ImportError:
TEST_NUMPY = False
TEST_SCIPY = True
try:
import scipy
except ImportError:
TEST_SCIPY = False
def skipIfNoLapack(fn):
    """Decorator that skips a test when torch reports LAPACK is missing.

    The wrapped callable runs normally. If it raises an exception whose first
    argument contains ``'Lapack library not found'``, the exception is
    replaced with ``unittest.SkipTest``; any other exception propagates
    unchanged.

    Args:
        fn: the test callable to wrap.

    Returns:
        The wrapping function, which forwards ``fn``'s return value.

    Raises:
        unittest.SkipTest: when the LAPACK-missing message is detected.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            # Exceptions raised with no arguments, or with a first argument
            # that does not support ``in``, must surface as themselves
            # instead of failing inside this handler.
            try:
                lapack_missing = 'Lapack library not found' in e.args[0]
            except (IndexError, TypeError):
                lapack_missing = False
            if lapack_missing:
                raise unittest.SkipTest('Compiled without Lapack')
            raise
    return wrapper
def suppress_warnings(fn):
    """Decorator that runs ``fn`` with all Python warnings ignored.

    The warning filters are changed only inside a ``warnings.catch_warnings``
    context, so they are restored once the call finishes.

    Args:
        fn: the callable to wrap.

    Returns:
        The wrapping function, which keeps ``fn``'s name and docstring and
        forwards its return value.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return fn(*args, **kwargs)
    return wrapper
def get_cpu_type(t):
assert t.__module__ == 'torch.cuda'
@ -17,13 +84,13 @@ def get_gpu_type(t):
def to_gpu(obj, type_map={}):
if torch.isTensor(obj):
if torch.is_tensor(obj):
t = type_map.get(type(obj), get_gpu_type(type(obj)))
return obj.clone().type(t)
elif torch.isStorage(obj):
elif torch.is_storage(obj):
return obj.new().resize_(obj.size()).copy_(obj)
elif isinstance(obj, Variable):
assert type(obj.creator) == Leaf
assert obj.is_leaf
t = type_map.get(type(obj.data), get_gpu_type(type(obj.data)))
return Variable(obj.data.clone().type(t), requires_grad=obj.requires_grad)
elif isinstance(obj, list):
@ -34,7 +101,20 @@ def to_gpu(obj, type_map={}):
return deepcopy(obj)
@contextlib.contextmanager
def freeze_rng_state():
    """Context manager that restores torch's RNG state on exit.

    Saves ``torch.get_rng_state()`` and, when CUDA is available,
    ``torch.cuda.get_rng_state()`` on entry, then writes them back when the
    ``with`` body finishes, whether it returns normally or raises.
    """
    rng_state = torch.get_rng_state()
    if torch.cuda.is_available():
        cuda_rng_state = torch.cuda.get_rng_state()
    try:
        yield
    finally:
        # Restore even when the body raises, so one failing test cannot
        # leak a changed RNG state into later ones.
        if torch.cuda.is_available():
            torch.cuda.set_rng_state(cuda_rng_state)
        torch.set_rng_state(rng_state)
def iter_indices(tensor):
if tensor.dim() == 0:
return range(0)
if tensor.dim() == 1:
return range(tensor.size(0))
return product(*(range(s) for s in tensor.size()))
@ -51,89 +131,161 @@ def is_iterable(obj):
class TestCase(unittest.TestCase):
precision = 1e-5
def setUp(self):
torch.manual_seed(SEED)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(SEED)
def assertTensorsSlowEqual(self, x, y, prec=None, message=''):
    """Assert two tensors match element by element within ``prec``.

    Checks that the sizes are equal, walks every index produced by
    ``iter_indices(x)`` to find the largest absolute difference, and asserts
    that it does not exceed ``prec``.

    Args:
        x, y: the tensors to compare.
        prec: maximum allowed absolute difference; defaults to
            ``self.precision`` when omitted.
        message: failure message passed to the final assertion.
    """
    if prec is None:
        # Same fallback assertEqual uses; comparing against None would fail.
        prec = self.precision
    max_err = 0
    self.assertEqual(x.size(), y.size())
    for index in iter_indices(x):
        max_err = max(max_err, abs(x[index] - y[index]))
    self.assertLessEqual(max_err, prec, message)
def safeCoalesce(self, t):
    """Coalesce sparse tensor ``t`` by hand and check it against ``t.coalesce()``.

    Values sharing the same index tuple are summed, the index tuples are
    sorted, and a new tensor is built from them with ``t.new``. Its indices
    and values are asserted equal to those of ``t.coalesce()``.

    Returns:
        The manually built tensor.
    """
    tc = t.coalesce()

    # Accumulate values per index tuple.
    summed = {}
    for idx, val in zip(t._indices().t(), t._values()):
        key = tuple(idx)
        if key not in summed:
            # Clone tensor values so the in-place add below cannot alias t.
            summed[key] = val.clone() if torch.is_tensor(val) else val
        else:
            summed[key] += val

    ordered_keys = sorted(summed)
    ordered_vals = [summed[key] for key in ordered_keys]
    if t._values().ndimension() < 2:
        new_values = t._values().new(ordered_vals)
    else:
        new_values = torch.stack(ordered_vals)
    new_indices = t._indices().new(ordered_keys).t()

    tg = t.new(new_indices, new_values, t.size())
    self.assertEqual(tc._indices(), tg._indices())
    self.assertEqual(tc._values(), tg._values())
    return tg
def unwrapVariables(self, x, y):
    """Return ``(x.data, y.data)`` for two Variables, else ``(x, y)`` as is.

    Raises:
        AssertionError: when exactly one of the arguments is a ``Variable``.
    """
    x_is_var = isinstance(x, Variable)
    y_is_var = isinstance(y, Variable)
    if x_is_var != y_is_var:
        raise AssertionError("cannot compare {} and {}".format(type(x), type(y)))
    if x_is_var:
        return x.data, y.data
    return x, y
def assertEqual(self, x, y, prec=None, message=''):
if prec is None:
prec = self.precision
if isinstance(x, Variable) and isinstance(y, Variable):
x = x.data
y = y.data
x, y = self.unwrapVariables(x, y)
if torch.isTensor(x) and torch.isTensor(y):
max_err = 0
super(TestCase, self).assertEqual(x.size().tolist(), y.size().tolist())
for index in iter_indices(x):
max_err = max(max_err, abs(x[index] - y[index]))
self.assertLessEqual(max_err, prec)
if torch.is_tensor(x) and torch.is_tensor(y):
def assertTensorsEqual(a, b):
super(TestCase, self).assertEqual(a.size(), b.size())
if a.numel() > 0:
b = b.type_as(a)
b = b.cuda(device=a.get_device()) if a.is_cuda else b.cpu()
# check that NaNs are in the same locations
nan_mask = a != a
self.assertTrue(torch.equal(nan_mask, b != b))
diff = a - b
diff[nan_mask] = 0
if diff.is_signed():
diff = diff.abs()
max_err = diff.max()
self.assertLessEqual(max_err, prec, message)
self.assertEqual(x.is_sparse, y.is_sparse, message)
if x.is_sparse:
x = self.safeCoalesce(x)
y = self.safeCoalesce(y)
assertTensorsEqual(x._indices(), y._indices())
assertTensorsEqual(x._values(), y._values())
else:
assertTensorsEqual(x, y)
elif type(x) == str and type(y) == str:
super(TestCase, self).assertEqual(x, y)
elif type(x) == set and type(y) == set:
super(TestCase, self).assertEqual(x, y)
elif is_iterable(x) and is_iterable(y):
super(TestCase, self).assertEqual(len(x), len(y))
for x_, y_ in zip(x, y):
self.assertEqual(x_, y_, prec, message)
else:
try:
self.assertLessEqual(abs(x - y), prec)
self.assertLessEqual(abs(x - y), prec, message)
return
except:
pass
super(TestCase, self).assertEqual(x, y)
super(TestCase, self).assertEqual(x, y, message)
def assertNotEqual(self, x, y, prec=None, message=''):
if prec is None:
prec = self.precision
x, y = self.unwrapVariables(x, y)
if torch.is_tensor(x) and torch.is_tensor(y):
if x.size() != y.size():
super(TestCase, self).assertNotEqual(x.size(), y.size())
self.assertGreater(x.numel(), 0)
y = y.type_as(x)
y = y.cuda(device=x.get_device()) if x.is_cuda else y.cpu()
nan_mask = x != x
if torch.equal(nan_mask, y != y):
diff = x - y
if diff.is_signed():
diff = diff.abs()
diff[nan_mask] = 0
max_err = diff.max()
self.assertGreaterEqual(max_err, prec, message)
elif type(x) == str and type(y) == str:
super(TestCase, self).assertNotEqual(x, y)
elif is_iterable(x) and is_iterable(y):
super(TestCase, self).assertNotEqual(x, y)
else:
try:
self.assertGreaterEqual(abs(x - y), prec, message)
return
except:
pass
super(TestCase, self).assertNotEqual(x, y, message)
def assertObjectIn(self, obj, iterable):
    """Assert that ``obj`` itself (by identity, not equality) is in ``iterable``.

    Raises:
        AssertionError: when no element of ``iterable`` is the same object.
    """
    if not any(elem is obj for elem in iterable):
        raise AssertionError("object not found in iterable")
if sys.version_info < (3, 2):
# assertRaisesRegexp renamed assertRaisesRegex in 3.2
assertRaisesRegex = unittest.TestCase.assertRaisesRegexp
def make_jacobian(input, num_out):
if torch.isTensor(input) or isinstance(input, Variable):
return torch.zeros(input.nElement(), num_out)
def download_file(url, binary=True):
if sys.version_info < (3,):
from urlparse import urlsplit
import urllib2
request = urllib2
error = urllib2
else:
return type(input)(make_jacobian(elem, num_out) for elem in input)
from urllib.parse import urlsplit
from urllib import request, error
filename = os.path.basename(urlsplit(url)[2])
data_dir = os.path.join(os.path.dirname(__file__), 'data')
path = os.path.join(data_dir, filename)
def iter_tensors(x):
    # Generator that walks a nested structure and yields the tensors in it:
    # a tensor is yielded as-is, a Variable yields its ``.data``, and anything
    # else is treated as an iterable and traversed recursively, in order.
    if torch.isTensor(x):
        yield x
    elif isinstance(x, Variable):
        yield x.data
    else:
        for elem in x:
            for result in iter_tensors(elem):
                yield result
def contiguous(input):
    # Return ``input`` with every leaf made contiguous: tensors go through
    # ``.contiguous()``, Variables through ``.contiguous_()``, and any other
    # value is rebuilt as the same container type from its processed elements.
    if torch.isTensor(input):
        return input.contiguous()
    elif isinstance(input, Variable):
        return input.contiguous_()
    else:
        return type(input)(contiguous(e) for e in input)
def get_numerical_jacobian(fn, input, target):
    # Estimate the Jacobian of ``fn(input)`` with respect to ``target`` by
    # central finite differences: each element of ``target`` is perturbed by
    # -/+ ``perturbation`` in place, ``fn(input)`` is re-evaluated, and the
    # difference quotient is stored in row ``i`` of the matching jacobian tensor.
    # ``target`` is expected to share storage with what ``fn`` reads, otherwise
    # the in-place perturbation has no effect -- TODO confirm with callers.
    perturbation = 1e-6
    # To be able to use .view(-1) input must be contiguous
    input = contiguous(input)
    output_size = fn(input).numel()
    jacobian = make_jacobian(target, output_size)
    # It's much easier to iterate over flattened lists of tensors.
    # These are reference to the same objects in jacobian, so any changes
    # will be reflected in it as well.
    x_tensors = [t for t in iter_tensors(target)]
    j_tensors = [t for t in iter_tensors(jacobian)]
    # Scratch buffers for the two perturbed outputs.
    outa = torch.Tensor(output_size)
    outb = torch.Tensor(output_size)
    # TODO: compare structure
    for x_tensor, d_tensor in zip(x_tensors, j_tensors):
        flat_tensor = x_tensor.view(-1)
        for i in range(flat_tensor.nElement()):
            orig = flat_tensor[i]
            flat_tensor[i] = orig - perturbation
            outa.copy_(fn(input))
            flat_tensor[i] = orig + perturbation
            outb.copy_(fn(input))
            # Restore the element before moving on to the next one.
            flat_tensor[i] = orig
            # (f(x + eps) - f(x - eps)) / (2 * eps)
            outb.add_(-1,outa).div_(2*perturbation)
            d_tensor[i] = outb
    return jacobian
if os.path.exists(path):
return path
try:
data = request.urlopen(url, timeout=15).read()
with open(path, 'wb' if binary else 'w') as f:
f.write(data)
return path
except error.URLError:
msg = "could not download test file '{}'".format(url)
warnings.warn(msg, RuntimeWarning)
raise unittest.SkipTest(msg)

View File

@ -2,10 +2,14 @@ import sys
import tempfile
import unittest
from copy import deepcopy
from itertools import product
import torch
import torch.cuda
from torch.autograd import Variable
from common import TestCase, to_gpu, get_numerical_jacobian, iter_tensors, contiguous
from common import TestCase, to_gpu, freeze_rng_state
from torch.autograd.gradcheck import get_numerical_jacobian, iter_tensors, contiguous
import torch.backends.cudnn
# tarfile module tries to obtain a file object name in python 3.3
if sys.version_info[:2] == (3, 3):
@ -13,15 +17,10 @@ if sys.version_info[:2] == (3, 3):
else:
TemporaryFile = tempfile.TemporaryFile
try:
import torch.cuda
import torch.legacy.cunn
import torch.nn.cuda
TEST_CUDA = True
except Exception:
# TODO: catch ImportError once it works with "setup.py develop"
TEST_CUDA = False
TEST_CUDA = torch.cuda.is_available()
TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2
TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.cuda.FloatTensor(1))
TEST_CUDNN_VERSION = TEST_CUDNN and torch.backends.cudnn.version()
PRECISION = 1e-5
module_tests = [
@ -29,7 +28,14 @@ module_tests = [
module_name='Linear',
constructor_args=(10, 8),
input_size=(4, 10),
reference_fn=lambda i,p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8)
reference_fn=lambda i, p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8)
),
dict(
module_name='Linear',
constructor_args=(10, 8, False),
input_size=(4, 10),
desc='no_bias',
reference_fn=lambda i, p: torch.mm(i, p[0].t())
),
dict(
module_name='Threshold',
@ -47,17 +53,31 @@ module_tests = [
dict(
module_name='ReLU',
input_size=(2, 3, 4, 5),
check_inplace=True
check_inplace=True,
),
dict(
module_name='ReLU6',
input_size=(2, 3, 4, 5),
check_inplace=True
check_inplace=True,
),
dict(
module_name='RReLU',
input_size=(1, 2, 2),
test_cuda=False,
check_gradgrad=False,
),
dict(
module_name='RReLU',
constructor_args=(0.1, 0.9),
input_size=(4, 4, 5),
desc='with_up_down',
test_cuda=False,
check_gradgrad=False,
),
dict(
module_name='Hardtanh',
input_size=(3, 2, 5),
reference_fn=lambda i,_: i.clamp(-1, 1)
reference_fn=lambda i, _: i.clamp(-1, 1),
),
dict(
module_name='Sigmoid',
@ -70,75 +90,35 @@ module_tests = [
dict(
module_name='Softmax',
input_size=(10, 20),
reference_fn=lambda i,_: torch.exp(i).div(torch.exp(i).sum(1).expand(10, 20))
reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, True).expand(10, 20)),
),
dict(
module_name='Softmax2d',
input_size=(1, 3, 10, 20),
reference_fn=lambda i,_: torch.exp(i).div(torch.exp(i).sum(1).expandAs(i))
),
dict(
module_name='BatchNorm1d',
constructor_args=(10,),
input_size=(4, 10),
desc='affine'
),
dict(
module_name='BatchNorm1d',
constructor_args=(10, 1e-3, 0.3, False),
input_size=(4, 10),
desc='not_affine'
),
dict(
module_name='BatchNorm2d',
constructor_args=(3,),
input_size=(2, 3, 6, 6),
),
dict(
module_name='BatchNorm2d',
constructor_args=(3, 1e-3, 0.8),
input_size=(2, 3, 6, 6),
desc='momentum',
),
dict(
module_name='BatchNorm2d',
constructor_args=(3, 1e-3, 0.8, False),
input_size=(2, 3, 6, 6),
desc='no_affine',
),
dict(
module_name='BatchNorm3d',
constructor_args=(3,),
input_size=(2, 3, 4, 4, 4)
),
dict(
module_name='BatchNorm3d',
constructor_args=(3, 1e-3, 0.7),
input_size=(2, 3, 4, 4, 4),
desc='momentum'
),
dict(
module_name='BatchNorm3d',
constructor_args=(3, 1e-3, 0.7, False),
input_size=(2, 3, 4, 4, 4),
desc='no_affine'
reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, False)),
),
dict(
module_name='LogSoftmax',
input_size=(10, 20),
reference_fn=lambda i,_: torch.exp(i).div_(torch.exp(i).sum(1).expand(10, 20)).log_()
reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, True).expand(10, 20)).log_(),
),
dict(
module_name='LogSoftmax',
input_size=(1, 3, 10, 20),
reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, False)).log_(),
desc='multiparam',
),
dict(
module_name='ELU',
constructor_args=(2.,),
input_size=(3, 2, 5),
check_inplace=True
),
# TODO: reference function
dict(
module_name='Hardshrink',
constructor_args=(2.,),
input_size=(4, 3, 2, 4)
input_size=(4, 3, 2, 4),
check_gradgrad=False,
),
dict(
module_name='LeakyReLU',
@ -155,53 +135,89 @@ module_tests = [
dict(
module_name='LogSigmoid',
input_size=(2, 3, 4),
reference_fn=lambda i,_: i.sigmoid().log()
reference_fn=lambda i, _: i.sigmoid().log(),
check_gradgrad=False,
),
dict(
module_name='Softplus',
input_size=(10, 20),
reference_fn=lambda i,_: torch.log(1 + torch.exp(i))
reference_fn=lambda i, _: torch.log(1 + torch.exp(i)),
check_gradgrad=False,
),
dict(
module_name='Softplus',
constructor_args=(2,),
input_size=(10, 20),
reference_fn=lambda i,_: 1. / 2. * torch.log(1 + torch.exp(2 * i)),
desc='beta'
reference_fn=lambda i, _: 1. / 2. * torch.log(1 + torch.exp(2 * i)),
desc='beta',
check_gradgrad=False,
),
dict(
module_name='Softshrink',
input_size=(3, 2, 5)
input_size=(3, 2, 5),
check_gradgrad=False,
),
dict(
module_name='Softshrink',
constructor_args=(1,),
input_size=(3, 2, 5),
desc='lambda'
desc='lambda',
check_gradgrad=False,
),
dict(
module_name='CrossMapLRN2d',
constructor_args=(5, 5e-3, 1e-3, 2),
input_size=(2, 3, 6, 6)
input_size=(2, 3, 6, 6),
check_gradgrad=False,
),
dict(
module_name='PReLU',
input_size=(2, 3, 4, 5)
input_size=(2, 3, 4),
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
desc='1d',
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4),
desc='1d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
input_size=(2, 3, 4, 5),
desc='2d',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4, 5),
desc='multiparam'
desc='2d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
input_size=(2, 3, 4, 5, 6),
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
desc='3d',
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4, 5, 6),
desc='3d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='Softsign',
input_size=(3, 2, 5),
reference_fn=lambda i,_: i.div(1 + torch.abs(i))
reference_fn=lambda i, _: i.div(1 + torch.abs(i)),
),
dict(
module_name='Softmin',
input_size=(10, 20)
input_size=(10, 20),
check_gradgrad=False,
),
dict(
module_name='Tanhshrink',
@ -209,19 +225,32 @@ module_tests = [
),
]
criterion_tests = [
dict(module_name='L1Loss',
input_size=(2, 3, 4),
target=torch.randn(2, 3, 4),
reference_fn=lambda i,t,_: 1./i.numel() * \
sum((a-b).abs().sum() for a,b in zip(i, t))
),
input_size=(2, 3, 4),
target=torch.randn(2, 3, 4),
reference_fn=lambda i, t, _: 1. / i.numel() *
sum((a - b).abs().sum() for a, b in zip(i, t)),
),
dict(
module_name='NLLLoss',
input=torch.rand(15, 10).log(),
target=torch.Tensor(15).uniform_().mul(10).floor().long(),
),
dict(
module_name='NLLLoss',
constructor_args=(None, False),
input=torch.rand(15, 10).log(),
target=torch.Tensor(15).uniform_().mul(10).floor().long(),
desc='no_size_average'
),
dict(
module_name='NLLLoss',
constructor_args=(None, True, 2),
input=torch.rand(15, 10).log(),
target=torch.Tensor(15).uniform_().mul(10).floor().long(),
desc='ignore_index'
),
dict(
module_name='NLLLoss',
constructor_args=(torch.rand(10),),
@ -229,113 +258,159 @@ criterion_tests = [
target=torch.Tensor(15).uniform_().mul(10).floor().long(),
desc='weights',
),
dict(
module_name='NLLLoss',
constructor_args=(torch.rand(10), True, 2),
input=torch.rand(15, 10).add(1e-2).log(),
target=torch.Tensor(15).uniform_().mul(10).floor().long(),
desc='weights_ignore_index'
),
dict(
module_name='NLLLoss',
constructor_args=(torch.rand(10), True, -1),
input=torch.rand(15, 10).add(1e-2).log(),
target=torch.Tensor(15).uniform_().mul(10 + 1).floor().long() - 1,
desc='weights_ignore_index_neg'
),
dict(
module_name='KLDivLoss',
input=torch.rand(10, 10).log(),
target=torch.rand(10, 10)
target=torch.rand(10, 10),
check_gradgrad=False,
),
dict(
module_name='MSELoss',
input=torch.randn(2, 3, 4, 5),
target=torch.randn(2, 3, 4, 5),
reference_fn=lambda i,t,_: (i-t).abs().pow(2).sum() / i.numel()
reference_fn=lambda i, t, _: (i - t).abs().pow(2).sum() / i.numel(),
check_gradgrad=False,
),
dict(
module_name='BCELoss',
input=torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2),
target=torch.randn(15, 10).gt(0).double()
target=torch.randn(15, 10).gt(0).double(),
check_gradgrad=False,
),
dict(
module_name='BCELoss',
constructor_args=(torch.rand(10),),
input=torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2),
target=torch.randn(15, 10).gt(0).double(),
desc='weights'
desc='weights',
check_gradgrad=False,
),
dict(
module_name='CELoss',
module_name='CrossEntropyLoss',
input=torch.randn(15, 10),
target=torch.Tensor(15).uniform_().mul(10).floor().long()
target=torch.Tensor(15).uniform_().mul(10).floor().long(),
check_gradgrad=False,
),
dict(
module_name='CELoss',
module_name='CrossEntropyLoss',
constructor_args=(torch.rand(10),),
input=torch.randn(15, 10),
target=torch.Tensor(15).uniform_().mul(10).floor().long(),
desc='weights'
desc='weights',
check_gradgrad=False,
),
dict(
module_name='NLLLoss2d',
input_size=(2, 3, 5, 5),
target=torch.rand(2, 5, 5).mul(3).floor().long()
target=torch.rand(2, 5, 5).mul(3).floor().long(),
),
dict(
module_name='NLLLoss2d',
constructor_args=(torch.rand(3),),
input_size=(2, 3, 5, 5),
target=torch.rand(2, 5, 5).mul(3).floor().long(),
desc='weights',
),
dict(
module_name='NLLLoss2d',
constructor_args=(None, True, 3),
input_size=(2, 3, 5, 5),
target=torch.rand(2, 5, 5).mul(4).floor().long(),
desc='ignore_index',
),
dict(
module_name='HingeEmbeddingLoss',
input=torch.rand(10),
target=torch.randn(10).gt(0).double().mul_(2).sub(1)
target=torch.randn(10).gt(0).double().mul_(2).sub(1),
check_gradgrad=False,
),
dict(
module_name='HingeEmbeddingLoss',
constructor_args=(0.5,),
input=torch.rand(10),
target=torch.randn(10).gt(0).double().mul_(2).sub(1),
desc='margin'
desc='margin',
check_gradgrad=False,
),
dict(
module_name='MultiLabelMarginLoss',
input_size=(5, 10),
target=torch.rand(5, 10).mul(10).floor()
target=torch.rand(5, 10).mul(10).floor().long(),
check_gradgrad=False,
),
dict(
module_name='MultiLabelSoftMarginLoss',
input_size=(5, 10),
target=torch.rand(5, 10).mul(2).floor()
target=torch.rand(5, 10).mul(2).floor(),
check_gradgrad=False,
),
dict(
module_name='MultiLabelSoftMarginLoss',
constructor_args=(torch.rand(10),),
input_size=(5, 10),
target=torch.rand(5, 10).mul(2).floor(),
desc='weights'
desc='weights',
check_gradgrad=False,
),
dict(
module_name='MultiMarginLoss',
input_size=(5, 10),
target=torch.rand(5).mul(8).floor()
target=torch.rand(5).mul(8).floor().long(),
check_gradgrad=False,
),
dict(
module_name='SmoothL1Loss',
input_size=(5, 10),
target=torch.randn(5, 10)
target=torch.randn(5, 10),
check_gradgrad=False,
),
dict(
module_name='SoftMarginLoss',
input_size=(5, 5),
target=torch.randn(5, 5).sign()
target=torch.randn(5, 5).sign(),
check_gradgrad=False,
),
dict(
module_name='CosineEmbeddingLoss',
input=(torch.rand(15, 10), torch.rand(15, 10)),
target=torch.randn(15).sign()
target=torch.randn(15).sign(),
check_gradgrad=False,
),
dict(
module_name='CosineEmbeddingLoss',
constructor_args=(0.7,),
input=(torch.rand(15, 10), torch.rand(15, 10)),
target=torch.randn(15).sign(),
desc='margin'
desc='margin',
check_gradgrad=False,
),
dict(
module_name='MarginRankingLoss',
input=(torch.randn(50).mul(10), torch.randn(50).mul(10)),
target=torch.randn(50).sign()
target=torch.randn(50).sign(),
check_gradgrad=False,
),
dict(
module_name='MarginRankingLoss',
constructor_args=(2,),
input=(torch.randn(50).mul(10), torch.randn(50).mul(10)),
target=torch.randn(50).sign(),
desc='margin'
desc='margin',
check_gradgrad=False,
),
]
@ -348,20 +423,25 @@ class NNTestCase(TestCase):
elif isinstance(input, list):
return [self._jacobian(elem, num_out) for elem in input]
else:
return torch.zeros(input.nElement(), num_out)
return torch.zeros(input.nelement(), num_out)
def _flatten_tensors(self, x):
if torch.isTensor(x):
return x.view(-1)
if torch.is_tensor(x):
if x.is_sparse:
return x.to_dense().view(-1)
else:
return x.view(-1)
elif isinstance(x, Variable):
return x.data.view(-1)
return self._flatten_tensors(x.data)
else:
return tuple(self._flatten_tensors(a) for a in x)
def _zero_grad_input(self, input):
if isinstance(input, Variable):
input.grad.zero_()
elif torch.isTensor(input):
if input.requires_grad and input.grad is not None:
input.grad.data.zero_()
input.grad.detach_()
elif torch.is_tensor(input):
return
else:
for i in input:
@ -374,15 +454,15 @@ class NNTestCase(TestCase):
flat_d_out = d_out.view(-1)
if jacobian_input:
jacobian_input = self._jacobian(input, d_out.nElement())
flat_jacobian_input = list(iter_tensors(jacobian_input))
jacobian_inp = self._jacobian(input, d_out.nelement())
flat_jacobian_input = list(iter_tensors(jacobian_inp))
if jacobian_parameters:
param, d_param = self._get_parameters(module)
num_param = sum(p.numel() for p in param)
jacobian_param = torch.zeros(num_param, d_out.nElement())
jacobian_param = torch.zeros(num_param, d_out.nelement())
for i in range(flat_d_out.nElement()):
for i in range(flat_d_out.nelement()):
d_out.zero_()
flat_d_out[i] = 1
@ -395,13 +475,13 @@ class NNTestCase(TestCase):
if jacobian_input:
for jacobian_x, d_x in zip(flat_jacobian_input, iter_tensors(d_input)):
jacobian_x[:,i] = d_x
jacobian_x[:, i] = d_x
if jacobian_parameters:
jacobian_param[:,i] = torch.cat(self._flatten_tensors(d_param), 0)
jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0)
res = tuple()
if jacobian_input:
res += jacobian_input,
res += jacobian_inp,
if jacobian_parameters:
res += jacobian_param,
@ -409,7 +489,7 @@ class NNTestCase(TestCase):
def _numerical_jacobian(self, module, input, jacobian_input=True, jacobian_parameters=True):
output = self._forward(module, input)
output_size = output.nElement()
output_size = output.nelement()
if jacobian_parameters:
param, d_param = self._get_parameters(module)
@ -421,12 +501,11 @@ class NNTestCase(TestCase):
return out
res = tuple()
# TODO: enable non-contig tests
input = contiguous(input)
if jacobian_input:
res += get_numerical_jacobian(fw, input, input),
res += get_numerical_jacobian(fw, input, input, eps=1e-6),
if jacobian_parameters:
res += torch.cat(list(get_numerical_jacobian(fw, input, p) for p in param), 0),
res += torch.cat(list(get_numerical_jacobian(fw, input, p, eps=1e-6) for p in param), 0),
return res
def check_jacobian(self, module, input, jacobian_input=True):
@ -452,13 +531,13 @@ class NNTestCase(TestCase):
for x, d_x in zip(input_t, numerical_t):
x = x.view(-1)
d_x = d_x.view(-1)
for i in range(x.nElement()):
for i in range(x.nelement()):
original = x[i]
x[i] = original + eps
fx1 = self._forward_criterion(criterion, input, target)
x[i] = original - eps
fx2 = self._forward_criterion(criterion, input, target)
deriv = (fx1 - fx2) / (2.*eps)
deriv = (fx1 - fx2) / (2. * eps)
d_x[i] = deriv
x[i] = original
@ -472,8 +551,9 @@ class NNTestCase(TestCase):
class TestBase(object):
def __init__(self, constructor, constructor_args=tuple(), input_size=None,
input=None, desc='', reference_fn=None, fullname=None, **kwargs):
input=None, desc='', reference_fn=None, fullname=None, **kwargs):
if input_size is None and input is None:
raise RuntimeError("Specify either an input tensor, or it's size!")
self.constructor = constructor
@ -496,7 +576,7 @@ class TestBase(object):
def _unpack_input(self, input):
if isinstance(input, Variable):
return input.data
elif torch.isTensor(input):
elif torch.is_tensor(input):
return input
else:
return type(input)(self._unpack_input(i) for i in input)
@ -508,8 +588,8 @@ class TestBase(object):
def map_input_sizes(sizes):
if isinstance(sizes, list):
return [map_input_sizes(s) for s in sizes]
elif torch.isTensor(sizes):
return sizes
elif torch.is_tensor(sizes):
return sizes.double()
else:
return torch.randn(*sizes)
@ -521,6 +601,7 @@ class TestBase(object):
class ModuleTest(TestBase):
def __init__(self, *args, **kwargs):
super(ModuleTest, self).__init__(*args, **kwargs)
self.jacobian_input = kwargs.get('jacobian_input', True)
@ -538,6 +619,8 @@ class ModuleTest(TestBase):
expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0])
test_case.assertEqual(out, expected_out)
self.test_noncontig(test_case, module, input)
# TODO: do this with in-memory files as soon as torch.save will support it
with TemporaryFile() as f:
test_case._forward(module, input)
@ -548,6 +631,51 @@ class ModuleTest(TestBase):
self._do_test(test_case, module, input)
def noncontiguize(self, obj):
    # Return a copy of ``obj`` holding the same values in non-contiguous memory.
    # Lists are processed element-wise; a Variable is rebuilt around the new
    # tensor with the same ``requires_grad``.
    if isinstance(obj, list):
        return [self.noncontiguize(o) for o in obj]
    tensor = obj.data if isinstance(obj, Variable) else obj
    ndim = tensor.dim()
    # Stack a zeroed clone and the tensor along a new trailing dimension, then
    # select index 1 of that dimension: the values are those of ``tensor`` but
    # they are interleaved with the zeros in memory.
    noncontig = torch.stack([tensor.clone().zero_(), tensor], ndim).select(ndim, 1)
    # A single-element tensor is allowed to remain contiguous.
    assert noncontig.numel() == 1 or not noncontig.is_contiguous()
    if isinstance(obj, Variable):
        return Variable(noncontig, requires_grad=obj.requires_grad)
    return noncontig
def test_noncontig(self, test_case, module, input):
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(input)
with freeze_rng_state():
output = test_case._forward(module, input)
grad_output = output
if isinstance(grad_output, Variable):
grad_output = grad_output.data.clone()
else:
grad_output = grad_output.clone()
output = output.clone()
grad_output.normal_()
d_input = deepcopy(test_case._backward(module, input, output, grad_output))
d_param = deepcopy(test_case._get_parameters(module)[1])
nc_input = self.noncontiguize(input)
nc_grad_output = self.noncontiguize(grad_output)
for contig_i, contig_g in product((True, False), repeat=2):
i = input if contig_i else nc_input
go = grad_output if contig_g else nc_grad_output
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(i)
with freeze_rng_state():
try:
out = test_case._forward(module, i)
except Exception:
# Some modules will fail because of non contiguous inputs and we're ok with that
continue
grad = test_case._backward(module, i, out, go)
test_case.assertEqual(out, output)
test_case.assertEqual(grad, d_input, 1e-4)
test_case.assertEqual(test_case._get_parameters(module)[1], d_param)
def test_cuda(self, test_case):
if not TEST_CUDA or not self.should_test_cuda:
raise unittest.SkipTest('Excluded from CUDA tests')
@ -557,9 +685,7 @@ class ModuleTest(TestBase):
gpu_input = to_gpu(cpu_input, type_map=type_map)
cpu_module = self.constructor(*self.constructor_args)
gpu_module = self.constructor(*self.constructor_args).cuda()
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
gpu_module = self.constructor(*self.constructor_args).float().cuda()
cpu_param = test_case._get_parameters(cpu_module)
gpu_param = test_case._get_parameters(gpu_module)
for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
@ -569,6 +695,10 @@ class ModuleTest(TestBase):
gpu_p = gpu_p.data
gpu_p.copy_(cpu_p)
test_case._zero_grad_input(cpu_input)
test_case._zero_grad_input(gpu_input)
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
cpu_output = test_case._forward(cpu_module, cpu_input)
gpu_output = test_case._forward(gpu_module, gpu_input)
test_case.assertEqual(cpu_output, gpu_output, 2e-4)
@ -582,6 +712,8 @@ class ModuleTest(TestBase):
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 2e-4)
for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
test_case.assertEqual(cpu_d_p, gpu_d_p, 2e-4)
self.test_noncontig(test_case, gpu_module, gpu_input)
except NotImplementedError:
pass
# TODO: remove this after CUDA scatter_ is implemented
@ -593,6 +725,7 @@ class ModuleTest(TestBase):
class CriterionTest(TestBase):
def __init__(self, *args, **kwargs):
super(CriterionTest, self).__init__(*args, **kwargs)
self.target = self._get_target(kwargs['target'])
@ -615,10 +748,11 @@ class CriterionTest(TestBase):
if isinstance(target, Variable):
target = target.data
expected_out = self.reference_fn(deepcopy(self._unpack_input(input)),
deepcopy(target), module)
deepcopy(target), module)
test_case.assertEqual(out, expected_out)
test_case.check_criterion_jacobian(module, input, self.target)
self._do_extra_tests(test_case, module, input, self.target)
def test_cuda(self, test_case):
if not TEST_CUDA or not self.should_test_cuda:
@ -627,7 +761,6 @@ class CriterionTest(TestBase):
cpu_input = self._get_input()
type_map = {
torch.DoubleTensor: torch.cuda.FloatTensor,
torch.LongTensor: torch.cuda.FloatTensor
}
gpu_input = to_gpu(cpu_input, type_map=type_map)
@ -635,15 +768,17 @@ class CriterionTest(TestBase):
gpu_target = to_gpu(self.target, type_map=type_map)
cpu_module = self.constructor(*self.constructor_args)
gpu_module = self.constructor(*self.constructor_args).cuda()
gpu_module = self.constructor(*self.constructor_args).float().cuda()
cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target)
gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target)
test_case.assertEqual(cpu_output, gpu_output, 2e-4)
test_case.assertEqual(cpu_output, gpu_output, 4e-4)
cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target)
gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target)
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 2e-4)
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 4e-4)
except NotImplementedError:
pass
def _do_extra_tests(self, test_case, module, input, target):
pass

8
test/data/network1.py Normal file
View File

@ -0,0 +1,8 @@
import torch.nn as nn
class Net(nn.Module):
    # Test fixture module: a single ``nn.Linear(10, 20)`` layer stored as
    # ``self.linear``; no ``forward`` is defined here.

    def __init__(self):
        super(Net, self).__init__()
        self.linear = nn.Linear(10, 20)

9
test/data/network2.py Normal file
View File

@ -0,0 +1,9 @@
import torch.nn as nn
class Net(nn.Module):
    # Test fixture module: an ``nn.Linear(10, 20)`` layer (``self.linear``)
    # plus an ``nn.ReLU`` (``self.relu``); no ``forward`` is defined here.

    def __init__(self):
        super(Net, self).__init__()
        self.linear = nn.Linear(10, 20)
        self.relu = nn.ReLU()

View File

@ -0,0 +1,71 @@
import torch
def check_error(desc, fn, *required_substrings):
    """Call ``fn``, print the error it raises and check the message contents.

    Args:
        desc: human-readable label printed above the error message.
        fn: zero-argument callable that is expected to raise.
        *required_substrings: strings that must all occur in the error message.

    Raises:
        AssertionError: if ``fn`` does not raise, or if a required substring
            is missing from the error message.
    """
    try:
        fn()
    except Exception as e:
        # Use args[0] when it is a string (the common case); otherwise fall
        # back to str(e) so an exception with no args, or a non-string first
        # argument, is reported as a failed check instead of crashing here.
        first_arg = e.args[0] if e.args else None
        error_message = first_arg if isinstance(first_arg, str) else str(e)
        print('=' * 80)
        print(desc)
        print('-' * 80)
        print(error_message)
        print('')
        for sub in required_substrings:
            # Raised explicitly so the check still runs under ``python -O``.
            if sub not in error_message:
                raise AssertionError(
                    "expected {!r} in error message of '{}'".format(sub, desc))
        return
    raise AssertionError("given function ({}) didn't raise an error".format(desc))
check_error(
'Wrong argument types',
lambda: torch.FloatStorage(object()),
'object')
check_error('Unknown keyword argument',
lambda: torch.FloatStorage(content=1234.),
'keyword')
check_error('Invalid types inside a sequence',
lambda: torch.FloatStorage(['a', 'b']),
'list', 'str')
check_error('Invalid size type',
lambda: torch.FloatStorage(1.5),
'float')
check_error('Invalid offset',
lambda: torch.FloatStorage(torch.FloatStorage(2), 4),
'2', '4')
check_error('Negative offset',
lambda: torch.FloatStorage(torch.FloatStorage(2), -1),
'2', '-1')
check_error('Invalid size',
lambda: torch.FloatStorage(torch.FloatStorage(3), 1, 5),
'2', '1', '5')
check_error('Negative size',
lambda: torch.FloatStorage(torch.FloatStorage(3), 1, -5),
'2', '1', '-5')
check_error('Invalid index type',
lambda: torch.FloatStorage(10)['first item'],
'str')
def assign():
    # Slice-assigns a str into a FloatStorage. Wrapped in a function because
    # an assignment statement cannot be written inside a lambda.
    torch.FloatStorage(10)[1:-1] = '1'
check_error('Invalid value type',
assign,
'str')
check_error('resize_ with invalid type',
lambda: torch.FloatStorage(10).resize_(1.5),
'float')
check_error('fill_ with invalid type',
lambda: torch.IntStorage(10).fill_('asdf'),
'str')
# TODO: frombuffer

6
test/ffi/src/cpu/lib.h Normal file
View File

@ -0,0 +1,6 @@
void good_func(THFloatTensor *tensor, int a, float b);
void bad_func(THFloatTensor *tensor, int a, float b);
THFloatTensor * new_tensor(int a);
float int_to_float(int a);

19
test/ffi/src/cpu/lib1.c Normal file
View File

@ -0,0 +1,19 @@
#include <TH/TH.h>
/*
 * Multiply `tensor` by `a`, then add `b`. The same tensor is passed as both
 * the first and second argument of each TH call, so the result is written
 * back into `tensor`.
 */
void good_func(THFloatTensor *tensor, int a, float b)
{
    THFloatTensor_mul(tensor, tensor, a);
    THFloatTensor_add(tensor, tensor, b);
}
/*
 * Create an `a` x `a` float tensor with every element set to `a` and return it.
 * NOTE(review): nothing here frees the tensor; presumably the caller takes
 * ownership -- confirm against the FFI wrapper.
 */
THFloatTensor * new_tensor(int a)
{
    THFloatTensor *t = THFloatTensor_newWithSize2d(a, a);
    THFloatTensor_fill(t, a);
    return t;
}
/* Return `a` converted to float (implicit int -> float conversion). */
float int_to_float(int a)
{
    return a;
}

8
test/ffi/src/cpu/lib2.c Normal file
View File

@ -0,0 +1,8 @@
#include <TH/TH.h>
/*
 * Same scale-and-shift as good_func, followed by an addbmm call that passes
 * `tensor` for every tensor argument.
 * NOTE(review): the name suggests this is meant to fail when used by the FFI
 * tests; how it is expected to fail is not visible here -- confirm.
 */
void bad_func(THFloatTensor *tensor, int a, float b)
{
    THFloatTensor_mul(tensor, tensor, a);
    THFloatTensor_add(tensor, tensor, b);
    THFloatTensor_addbmm(tensor, 1, tensor, 1, tensor, tensor);
}

View File

@ -0,0 +1,12 @@
#include <TH/TH.h>
#include <THC/THC.h>
extern THCState *state;
#include "../cpu/lib1.c"
/*
 * CUDA counterpart of good_func: multiply `tensor` by `a`, then add `b`,
 * writing back into `tensor`. Uses the `state` object declared `extern` above.
 */
void cuda_func(THCudaTensor *tensor, int a, float b)
{
    THCudaTensor_mul(state, tensor, tensor, a);
    THCudaTensor_add(state, tensor, tensor, b);
}

View File

@ -0,0 +1,5 @@
void good_func(THFloatTensor *tensor, int a, float b);
void cuda_func(THCudaTensor *tensor, int a, float b);
THFloatTensor * new_tensor(int a);
float int_to_float(int a);

5
test/ffi/src/lib.h Normal file
View File

@ -0,0 +1,5 @@
void my_func(THFloatTensor *tensor, int a, float b);
void my_cuda_func(THCudaTensor *tensor, int a, float b);
THFloatTensor * new_t(int a);
float new_int(int a);

View File

@ -1,5 +1,5 @@
# th test.lua > lua.out
th test.lua > lua.out
python3 test.py > python.out
diff lua.out python.out >/dev/null 2>&1

File diff suppressed because it is too large Load Diff

View File

@ -1,39 +0,0 @@
assert(arg[1])
funcs = {
'resizeAs', 'add', 'zero', 'mul', 'div', 'abs',
'addcmul', 'addcdiv', 'copy', 'sqrt', 'fill',
{'cmul', 'mul'},
{'cdiv', 'div'},
}
for _, val in pairs(funcs) do
local name, newname
if type(val) == 'table' then
name = val[1]
newname = val[2]
else
name = val
newname = val .. '_'
end
command = "sed -i -r "
.. "'/torch\\." .. name .. "\\(/b; " -- short-circuits
.. "s/([a-zA-Z]*)\\." .. name .. "\\(" -- substitution
.. "/"
.. "\\1\\." .. newname .. "\\(/g' " .. arg[1]
print(command)
os.execute(command)
command = "sed -i 's/math\\." .. newname
.. "/math\\." .. name .. "/' " .. arg[1]
print(command)
os.execute(command)
end
funcs = {
{'torch\.cmul', 'torch\.mul'},
{'torch\.cdiv', 'torch\.div'},
}
for _, val in pairs(funcs) do
command = "sed -i 's/" .. val[1] .. "/" .. val[2] .. "/' " .. arg[1]
print(command)
os.execute(command)
end

33
test/optim/test.lua Normal file
View File

@ -0,0 +1,33 @@
local cjson = require 'cjson'
require 'optim'
-- Rosenbrock function evaluated at the point (t[1], t[2]).
-- Returns (1 - x)^2 + 100 * (y - x^2)^2.
function rosenbrock(t)
    -- `local` keeps x and y from leaking into (and clobbering) globals.
    local x, y = t[1], t[2]
    return (1 - x) ^ 2 + 100 * (y - x^2)^2
end
-- Gradient-like vector paired with rosenbrock() at the point (t[1], t[2]),
-- returned as a 2-element torch.DoubleTensor.
-- NOTE(review): the second component carries an extra factor `x` compared with
-- the analytic d/dy = 200 * (y - x^2). It is kept unchanged because the Python
-- counterpart of this script uses the same expression and the two outputs are
-- compared against each other.
function drosenbrock(t)
    -- `local` keeps x and y from leaking into (and clobbering) globals.
    local x, y = t[1], t[2]
    return torch.DoubleTensor({-400 * x * (y - x^2) - 2 * (1 - x), 200 * x * (y - x^2)})
end
-- Driver: read the optimizer test cases from tests.json and, for every
-- algorithm/config pair, run 100 optimization steps from (1.5, 1.5),
-- printing the parameters after each step.
local fd = io.open('tests.json', 'r')
local tests = cjson.decode(fd:read('*a'))
fd:close()
for i, test in ipairs(tests) do
    print(test.algorithm)
    algorithm = optim[test.algorithm]
    -- The inner loops reuse the name `i`; each one shadows the outer index.
    for i, config in ipairs(test.config) do
        print('================================================================================')
        params = torch.DoubleTensor({1.5, 1.5})
        for i = 1, 100 do
            -- Returns the function value and its gradient for the optimizer.
            function closure(x)
                return rosenbrock(x), drosenbrock(x)
            end
            algorithm(closure, params, config)
            print(string.format('%.8f\t%.8f', params[1], params[2]))
        end
    end
end

View File

@ -3,13 +3,15 @@ import torch
import torch.legacy.optim as optim
from pprint import pprint
def rosenbrock(tensor):
x, y = tensor
return (1 - x)**2 + 100 * (y - x**2)**2
return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2
def drosenbrock(tensor):
x, y = tensor
return torch.DoubleTensor((-400 * x * (y - x**2) - 2 * (1 - x), 200 * x * (y - x**2)))
return torch.DoubleTensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * x * (y - x ** 2)))
algorithms = {
'adadelta': optim.adadelta,
@ -22,6 +24,7 @@ algorithms = {
'rmsprop': optim.rmsprop,
'rprop': optim.rprop,
'sgd': optim.sgd,
'lbfgs': optim.lbfgs,
}
with open('tests.json', 'r') as f:
@ -35,4 +38,4 @@ for test in tests:
params = torch.DoubleTensor((1.5, 1.5))
for i in range(100):
algorithm(lambda x: (rosenbrock(x), drosenbrock(x)), params, config)
print('{:.12f}\t{:.12f}\t'.format(params[0], params[1]))
print('{:.8f}\t{:.8f}\t'.format(params[0], params[1]))

View File

@ -98,5 +98,12 @@
{"learningRate": 1e-4, "nesterov": true, "momentum": 0.95, "dampening": 0},
{"weightDecay": 0.2}
]
},
{
"algorithm": "lbfgs",
"config": [
{},
{"learningRate": 1e-1}
]
}
]

View File

@ -1,31 +1,95 @@
#!/usr/bin/env bash
set -e
PYCMD=${PYCMD:="python"}
COVERAGE=0
while [[ "$#" -gt 0 ]]; do
case "$1" in
-p|--python) PYCMD=$2; shift 2 ;;
-c|--coverage) COVERAGE=1; shift 1;;
--) shift; break ;;
*) echo "Invalid argument: $1!" ; exit 1 ;;
esac
done
if [[ $COVERAGE -eq 1 ]]; then
coverage erase
PYCMD="coverage run --parallel-mode --source torch "
echo "coverage flag found. Setting python command to: \"$PYCMD\""
fi
pushd "$(dirname "$0")"
echo "Running torch tests"
python test_torch.py
$PYCMD test_torch.py $@
echo "Running autograd tests"
python test_autograd.py
$PYCMD test_autograd.py $@
echo "Running sparse tests"
$PYCMD test_sparse.py $@
echo "Running nn tests"
python test_nn.py
$PYCMD test_nn.py $@
echo "Running legacy nn tests"
python test_legacy_nn.py
$PYCMD test_legacy_nn.py $@
echo "Running optim tests"
$PYCMD test_optim.py $@
echo "Running multiprocessing tests"
python test_multiprocessing.py
$PYCMD test_multiprocessing.py $@
MULTIPROCESSING_METHOD=spawn $PYCMD test_multiprocessing.py $@
MULTIPROCESSING_METHOD=forkserver $PYCMD test_multiprocessing.py $@
echo "Running util tests"
python test_utils.py
if which nvcc >/dev/null 2>&1
then
echo "Running cuda tests"
python test_cuda.py
$PYCMD test_utils.py $@
echo "Running dataloader tests"
$PYCMD test_dataloader.py $@
echo "Running cuda tests"
$PYCMD test_cuda.py $@
echo "Running NCCL tests"
$PYCMD test_nccl.py $@
# Create a fresh scratch directory for one distributed test run and export it
# as TEMP_DIR, with the "barrier" and "test_dir" subdirectories inside it.
# Returns non-zero if the directory cannot be created.
distributed_set_up() {
  # Assign first, export second: `export VAR="$(cmd)"` hides cmd's exit status,
  # so a failed mktemp would leave TEMP_DIR empty without stopping the script.
  TEMP_DIR="$(mktemp -d)" || return 1
  export TEMP_DIR
  # ${TEMP_DIR:?} aborts instead of expanding to "/"* if the variable is empty.
  rm -rf "${TEMP_DIR:?}/"*
  mkdir "$TEMP_DIR/barrier"
  mkdir "$TEMP_DIR/test_dir"
}
# Remove the scratch directory created by distributed_set_up.
# Also installed as an EXIT/signal trap, so it may run before any set-up has
# happened; in that case TEMP_DIR is empty and there is nothing to remove.
distributed_tear_down() {
  if [ -n "${TEMP_DIR:-}" ]; then
    rm -rf "$TEMP_DIR"
  fi
}
trap distributed_tear_down EXIT SIGHUP SIGINT SIGTERM
echo "Running distributed tests for the TCP backend"
distributed_set_up
BACKEND=tcp WORLD_SIZE=3 $PYCMD ./test_distributed.py
distributed_tear_down
echo "Running distributed tests for the Gloo backend"
distributed_set_up
BACKEND=gloo WORLD_SIZE=3 $PYCMD ./test_distributed.py
distributed_tear_down
if [ -x "$(command -v mpiexec)" ]; then
echo "Running distributed tests for the MPI backend"
distributed_set_up
BACKEND=mpi mpiexec -n 3 $PYCMD ./test_distributed.py
distributed_tear_down
else
echo "nvcc not found in PATH, skipping CUDA tests"
echo "Skipping MPI backend tests (MPI not found)"
fi
if [[ $COVERAGE -eq 1 ]]; then
coverage combine
coverage html
fi
popd

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

337
test/test_dataloader.py Normal file
View File

@ -0,0 +1,337 @@
import math
import sys
import torch
import traceback
import unittest
from torch.utils.data import Dataset, TensorDataset, DataLoader, ConcatDataset
from common import TestCase, run_tests, TEST_NUMPY
from common_nn import TEST_CUDA
class TestTensorDataset(TestCase):
    """Length and indexing checks for ``TensorDataset``."""

    def test_len(self):
        # 15 samples in, 15 reported by len().
        data = torch.randn(15, 10, 2, 3, 4, 5)
        targets = torch.randperm(15)
        dataset = TensorDataset(data, targets)
        self.assertEqual(len(dataset), 15)

    def test_getitem(self):
        # Each item is a (data[i], target[i]) pair for multi-dimensional inputs.
        data = torch.randn(15, 10, 2, 3, 4, 5)
        targets = torch.randn(15, 10)
        dataset = TensorDataset(data, targets)
        for idx in range(15):
            item = dataset[idx]
            self.assertEqual(data[idx], item[0])
            self.assertEqual(targets[idx], item[1])

    def test_getitem_1d(self):
        # Same pairing check when both tensors are one-dimensional.
        data = torch.randn(15)
        targets = torch.randn(15)
        dataset = TensorDataset(data, targets)
        for idx in range(15):
            item = dataset[idx]
            self.assertEqual(data[idx], item[0])
            self.assertEqual(targets[idx], item[1])
class TestConcatDataset(TestCase):
    """Length, indexing and out-of-range checks for ``ConcatDataset``."""

    def test_concat_two_singletons(self):
        combined = ConcatDataset([[0], [1]])
        self.assertEqual(2, len(combined))
        self.assertEqual(0, combined[0])
        self.assertEqual(1, combined[1])

    def test_concat_two_non_singletons(self):
        first = [0, 1, 2, 3, 4]
        second = [5, 6, 7, 8, 9]
        combined = ConcatDataset([first, second])
        self.assertEqual(10, len(combined))
        self.assertEqual(0, combined[0])
        self.assertEqual(5, combined[5])

    def test_concat_two_non_singletons_with_empty(self):
        # Adding an empty dataset somewhere is correctly handled
        first = [0, 1, 2, 3, 4]
        second = [5, 6, 7, 8, 9]
        combined = ConcatDataset([first, [], second])
        self.assertEqual(10, len(combined))
        self.assertEqual(0, combined[0])
        self.assertEqual(5, combined[5])

    def test_concat_raises_index_error(self):
        first = [0, 1, 2, 3, 4]
        second = [5, 6, 7, 8, 9]
        combined = ConcatDataset([first, second])
        with self.assertRaises(IndexError):
            # this one goes to 11
            combined[11]
class ErrorDataset(Dataset):
    """Dataset that reports a length but does not define ``__getitem__`` itself."""

    def __init__(self, size):
        # Number of items this dataset claims to contain.
        self.size = size

    def __len__(self):
        reported = self.size
        return reported
class TestDataLoader(TestCase):
    """``DataLoader`` tests run against a 100-sample ``TensorDataset`` fixture."""

    def setUp(self):
        self.data = torch.randn(100, 2, 3, 5)
        self.labels = torch.randperm(50).repeat(2)
        self.dataset = TensorDataset(self.data, self.labels)

    def _test_sequential(self, loader):
        """Assert that ``loader`` yields the fixture in order, batch by batch."""
        step = loader.batch_size
        for i, (sample, target) in enumerate(loader):
            start = i * step
            stop = start + step
            self.assertEqual(sample, self.data[start:stop])
            self.assertEqual(target, self.labels[start:stop])
        # After the loop ``i`` holds the index of the last batch produced.
        last_batch = math.floor((len(self.dataset) - 1) / step)
        self.assertEqual(i, last_batch)

    def _test_shuffle(self, loader):
        """Assert that ``loader`` yields every fixture sample once, with its label."""
        seen_data = dict.fromkeys(range(self.data.size(0)), 0)
        seen_labels = dict.fromkeys(range(self.labels.size(0)), 0)
        step = loader.batch_size
        for i, (batch_samples, batch_targets) in enumerate(loader):
            for sample, target in zip(batch_samples, batch_targets):
                # Locate the fixture row equal to this sample; ``row`` keeps
                # its value after the loop and is used for the label check.
                for row, candidate in enumerate(self.data):
                    if candidate.eq(sample).all():
                        self.assertFalse(seen_data[row])
                        seen_data[row] += 1
                        break
                self.assertEqual(target, self.labels[row])
                seen_labels[row] += 1
            consumed = (i + 1) * step
            self.assertEqual(sum(seen_data.values()), consumed)
            self.assertEqual(sum(seen_labels.values()), consumed)
        last_batch = math.floor((len(self.dataset) - 1) / step)
        self.assertEqual(i, last_batch)

    def _test_error(self, loader):
        """Assert that every batch request raises ``NotImplementedError``."""
        batches = iter(loader)
        failures = 0
        while True:
            try:
                next(batches)
            except NotImplementedError:
                failures += 1
            except StopIteration:
                expected = math.ceil(float(len(loader.dataset)) / loader.batch_size)
                self.assertEqual(failures, expected)
                return

    def test_sequential(self):
        loader = DataLoader(self.dataset)
        self._test_sequential(loader)

    def test_sequential_batch(self):
        loader = DataLoader(self.dataset, batch_size=2)
        self._test_sequential(loader)

    def test_growing_dataset(self):
        # Loaders are built over 4 items; one more is appended afterwards and
        # both loaders are expected to report the new length.
        dataset = [torch.ones(4) for _ in range(4)]
        sequential = DataLoader(dataset, shuffle=False)
        shuffled = DataLoader(dataset, shuffle=True)
        dataset.append(torch.ones(4))
        self.assertEqual(len(sequential), 5)
        self.assertEqual(len(shuffled), 5)

    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
    def test_sequential_pin_memory(self):
        loader = DataLoader(self.dataset, batch_size=2, pin_memory=True)
        for batch, target in loader:
            self.assertTrue(batch.is_pinned())
            self.assertTrue(target.is_pinned())

    def test_shuffle(self):
        loader = DataLoader(self.dataset, shuffle=True)
        self._test_shuffle(loader)

    def test_shuffle_batch(self):
        loader = DataLoader(self.dataset, batch_size=2, shuffle=True)
        self._test_shuffle(loader)

    def test_sequential_workers(self):
        loader = DataLoader(self.dataset, num_workers=4)
        self._test_sequential(loader)

    def test_seqential_batch_workers(self):
        loader = DataLoader(self.dataset, batch_size=2, num_workers=4)
        self._test_sequential(loader)

    def test_shuffle_workers(self):
        loader = DataLoader(self.dataset, shuffle=True, num_workers=4)
        self._test_shuffle(loader)

    def test_shuffle_batch_workers(self):
        loader = DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4)
        self._test_shuffle(loader)

    def _test_batch_sampler(self, **kwargs):
        """Check a custom ``batch_sampler`` alternating batches of 2 and 3 indices."""
        # [(0, 1), (2, 3, 4), (5, 6), (7, 8, 9), ...]
        batches = []
        for start in range(0, 100, 5):
            batches.append(tuple(range(start, start + 2)))
            batches.append(tuple(range(start + 2, start + 5)))

        dl = DataLoader(self.dataset, batch_sampler=batches, **kwargs)
        self.assertEqual(len(dl), 40)
        for i, (batch, _target) in enumerate(dl):
            # Even batches hold 2 samples, odd batches 3; the offset formula
            # is the same for both.
            offset = i * 5 // 2
            width = 2 if i % 2 == 0 else 3
            self.assertEqual(len(batch), width)
            self.assertEqual(batch, self.data[offset:offset + width])

    def test_batch_sampler(self):
        self._test_batch_sampler()
        self._test_batch_sampler(num_workers=4)

    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
    def test_shuffle_pin_memory(self):
        loader = DataLoader(self.dataset, batch_size=2, shuffle=True,
                            num_workers=4, pin_memory=True)
        for batch, target in loader:
            self.assertTrue(batch.is_pinned())
            self.assertTrue(target.is_pinned())

    @unittest.skipIf(not TEST_NUMPY, "numpy unavailable")
    def test_numpy(self):
        import numpy as np

        class TestDataset(torch.utils.data.Dataset):
            def __getitem__(self, i):
                return np.ones((2, 3, 4)) * i

            def __len__(self):
                return 1000

        loader = DataLoader(TestDataset(), batch_size=12)
        first_batch = next(iter(loader))
        self.assertIsInstance(first_batch, torch.DoubleTensor)
        self.assertEqual(first_batch.size(), torch.Size([12, 2, 3, 4]))

    def test_error(self):
        loader = DataLoader(ErrorDataset(100), batch_size=2, shuffle=True)
        self._test_error(loader)

    def test_error_workers(self):
        loader = DataLoader(ErrorDataset(41), batch_size=2, shuffle=True, num_workers=4)
        self._test_error(loader)

    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
    def test_partial_workers(self):
        "check that workers exit even if the iterator is not exhausted"
        loader = iter(DataLoader(self.dataset, batch_size=2, num_workers=4, pin_memory=True))
        workers = loader.workers
        pin_thread = loader.pin_thread
        # Consume a few batches only, then drop the iterator.
        for i, sample in enumerate(loader):
            if i == 3:
                break
        del loader
        for worker in workers:
            worker.join(1.0)  # timeout of one second
            self.assertFalse(worker.is_alive(), 'subprocess not terminated')
            self.assertEqual(worker.exitcode, 0)
        pin_thread.join(1.0)
        self.assertFalse(pin_thread.is_alive())

    def test_len(self):
        def check_len(dl, expected):
            # len() and the number of items actually iterated must agree.
            self.assertEqual(len(dl), expected)
            counted = sum(1 for _ in dl)
            self.assertEqual(counted, expected)

        check_len(self.dataset, 100)
        check_len(DataLoader(self.dataset, batch_size=2), 50)
        check_len(DataLoader(self.dataset, batch_size=3), 34)

    @unittest.skipIf(not TEST_NUMPY, "numpy unavailable")
    def test_numpy_scalars(self):
        import numpy as np

        class ScalarDataset(torch.utils.data.Dataset):
            def __init__(self, dtype):
                self.dtype = dtype

            def __getitem__(self, i):
                return self.dtype()

            def __len__(self):
                return 4

        # numpy scalar type -> tensor type the collated batch is expected to have
        dtypes = {
            np.float64: torch.DoubleTensor,
            np.float32: torch.FloatTensor,
            np.float16: torch.HalfTensor,
            np.int64: torch.LongTensor,
            np.int32: torch.IntTensor,
            np.int16: torch.ShortTensor,
            np.int8: torch.CharTensor,
            np.uint8: torch.ByteTensor,
        }
        for scalar_type, tensor_type in dtypes.items():
            loader = DataLoader(ScalarDataset(scalar_type), batch_size=2)
            first_batch = next(iter(loader))
            self.assertIsInstance(first_batch, tensor_type)
class StringDataset(Dataset):
    """Five-item dataset whose items are ``(character, index)`` pairs of '12345'."""

    def __init__(self):
        self.s = '12345'

    def __len__(self):
        return len(self.s)

    def __getitem__(self, ndx):
        char = self.s[ndx]
        return (char, ndx)
class TestStringDataLoader(TestCase):
    """``DataLoader`` checks for a dataset that mixes strings and integers."""

    def setUp(self):
        self.dataset = StringDataset()

    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
    def test_shuffle_pin_memory(self):
        loader = DataLoader(self.dataset, batch_size=2, shuffle=True,
                            num_workers=4, pin_memory=True)
        for strings, indices in loader:
            # Strings stay plain ``str``; the index batch is expected to be pinned.
            self.assertIsInstance(strings[0], str)
            self.assertTrue(indices.is_pinned())
class DictDataset(Dataset):
    """Four-item dataset whose items are nested dicts built from the index."""

    def __len__(self):
        return 4

    def __getitem__(self, ndx):
        # A 4x2 tensor filled with the index, plus the index in a nested dict.
        filled = torch.Tensor(4, 2).fill_(ndx)
        nested = {'a_number': ndx}
        return {'a_tensor': filled, 'another_dict': nested}
class TestDictDataLoader(TestCase):
    """``DataLoader`` checks for a dataset whose items are nested dicts."""

    def setUp(self):
        self.dataset = DictDataset()

    def test_sequential_batch(self):
        loader = DataLoader(self.dataset, batch_size=2, shuffle=False)
        step = loader.batch_size
        for i, sample in enumerate(loader):
            first = i * step
            # The batched sample keeps the dict structure of a single item.
            self.assertEqual(set(sample.keys()), {'a_tensor', 'another_dict'})
            self.assertEqual(set(sample['another_dict'].keys()), {'a_number'})

            tensors = sample['a_tensor']
            self.assertEqual(tensors.size(), torch.Size([step, 4, 2]))
            self.assertTrue((tensors[0] == first).all())
            self.assertTrue((tensors[1] == first + 1).all())

            numbers = sample['another_dict']['a_number']
            self.assertEqual(numbers.size(), torch.Size([step]))
            self.assertEqual(numbers[0], first)
            self.assertEqual(numbers[1], first + 1)

    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
    def test_pin_memory(self):
        loader = DataLoader(self.dataset, batch_size=2, pin_memory=True)
        for sample in loader:
            self.assertTrue(sample['a_tensor'].is_pinned())
            self.assertTrue(sample['another_dict']['a_number'].is_pinned())
# Run the suite when this file is executed as a script.
if '__main__' == __name__:
    run_tests()

548
test/test_distributed.py Normal file
View File

@ -0,0 +1,548 @@
import fcntl
import multiprocessing
import os
import sys
import time
import unittest
from functools import wraps, reduce
from contextlib import contextmanager
import torch
import torch.distributed as dist
from common import TestCase
# Check availability before reading the environment: a build without
# distributed support should exit cleanly with the skip message instead of
# raising KeyError when BACKEND / TEMP_DIR happen to be unset.
if not dist.is_available():
    print('Distributed not available, skipping tests')
    sys.exit(0)

# Both variables are required; a KeyError here means the caller did not
# export them before launching this file.
BACKEND = os.environ['BACKEND']
TEMP_DIR = os.environ['TEMP_DIR']

# Rendezvous address/port exported to the environment by the TCP/Gloo test class.
MASTER_PORT = '29500'
MASTER_ADDR = '127.0.0.1'
@contextmanager
def _lock():
    """Hold an exclusive ``flock`` on ``TEMP_DIR/lockfile`` for the ``with`` body.

    The lock file is opened for writing on every entry. The lock is released
    and the file closed when the body exits, whether normally or by exception.
    """
    lockfile = os.path.join(TEMP_DIR, 'lockfile')
    # The ``with`` statement closes the file, so no explicit close() is needed.
    with open(lockfile, 'w') as lf:
        # Acquire outside the try block so the finally clause only ever
        # releases a lock that was actually taken.
        fcntl.flock(lf.fileno(), fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.flock(lf.fileno(), fcntl.LOCK_UN)
def _build_tensor(size, value=None):
    """Return a ``size x size x size`` FloatTensor filled with ``value``.

    When ``value`` is None the tensor is filled with ``size`` itself.
    """
    fill = size if value is None else value
    cube = torch.FloatTensor(size, size, size)
    return cube.fill_(fill)
class Barrier(object):
    """File-based barrier shared by the test processes.

    Each process writes its current barrier counter to a file named after its
    pid under ``TEMP_DIR/barrier``; ``sync`` polls those files until
    ``dist.get_world_size()`` of them have reached the caller's counter.
    """

    # Number of times ``sync`` has been entered since the last ``init``.
    barrier_id = 0

    @classmethod
    def init(cls):
        """Reset the counter and delete every file in the barrier directory."""
        cls.barrier_id = 0
        barrier_dir = os.path.join(TEMP_DIR, 'barrier')
        for entry in os.listdir(barrier_dir):
            os.unlink(os.path.join(barrier_dir, entry))

    @classmethod
    def sync(cls, timeout=5):
        """Block until all processes reach this barrier.

        Raises:
            RuntimeError: if more than ``timeout`` seconds pass while waiting.
        """
        cls.barrier_id += 1
        barrier_dir = os.path.join(TEMP_DIR, 'barrier')
        own_file = os.path.join(barrier_dir, str(os.getpid()))
        # Publish this process's arrival.
        with _lock():
            with open(own_file, 'w') as f:
                f.write(str(cls.barrier_id))

        started = time.time()
        while True:
            arrived = 0
            with _lock():
                for entry in os.listdir(barrier_dir):
                    with open(os.path.join(barrier_dir, entry), 'r') as f:
                        data = f.read()
                    # A process counts as arrived once its counter has caught up.
                    if int(data) >= cls.barrier_id:
                        arrived += 1
            if arrived == dist.get_world_size():
                break

            if time.time() - started > timeout:
                raise RuntimeError("barrier timeout")
            time.sleep(0.1)
class _DistTestBase(object):
    """Backend-independent distributed tests, mixed into the concrete test classes.

    Every test ends with ``self._barrier()`` so that all processes finish a
    test before any of them moves on.
    """

    def _barrier(self, *args, **kwargs):
        """Synchronize all test processes through the file-based ``Barrier``."""
        Barrier.sync(*args, **kwargs)

    def _init_group_test(self):
        """Create the sub-group ``[1, 2]``.

        Returns ``(group, group_id, rank)``; processes outside the group get
        ``([], None, rank)`` so their helper loops do nothing.
        """
        group = [1, 2]
        group_id = dist.new_group(group)
        rank = dist.get_rank()
        if rank not in group:
            return ([], None, rank)

        return (group, group_id, rank)

    def _init_global_test(self):
        """Return ``(all ranks, dist.group.WORLD, rank)`` for whole-world tests."""
        group = list(range(dist.get_world_size()))
        group_id = dist.group.WORLD
        rank = dist.get_rank()
        return (group, group_id, rank)

    # GET RANK
    def test_get_rank(self):
        # Every process writes its rank to a file named after its pid; the
        # number of distinct ranks must equal the world size.
        test_dir = os.path.join(TEMP_DIR, 'test_dir')
        pid = str(os.getpid())
        num_processes = dist.get_world_size()
        with open(os.path.join(test_dir, pid), 'w') as f:
            f.write(str(dist.get_rank()))

        self._barrier()

        all_ranks = set()
        for f_name in os.listdir(test_dir):
            with open(os.path.join(test_dir, f_name), 'r') as f:
                all_ranks.add(int(f.read()))
        self.assertEqual(len(all_ranks), num_processes)

        self._barrier()

        # Rank 0 cleans up once everybody has finished reading.
        if dist.get_rank() == 0:
            for f_name in os.listdir(test_dir):
                os.unlink(os.path.join(test_dir, f_name))

        self._barrier()

    # SEND RECV
    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support send/recv")
    def test_send_recv(self):
        rank = dist.get_rank()
        tensor = _build_tensor(rank + 1)
        for dest in range(dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        for src in range(dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(src + 1, value=-1)
            expected_tensor = _build_tensor(src + 1)
            dist.recv(tensor, src)
            self.assertEqual(tensor, expected_tensor)

        self._barrier()

    # SEND RECV ANY SOURCE
    @unittest.skipIf(BACKEND == 'gloo',
                     "Gloo does not support send/recv from any source")
    def test_send_recv_any_source(self):
        rank = dist.get_rank()
        tensor = _build_tensor(10, rank)
        for dest in range(dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        recv_ranks = set()
        for src in range(dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(10, value=-1)
            dist.recv(tensor)
            # The payload is filled with the sender's rank.
            recv_ranks.add(tensor.resize_(1)[0])

        self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
        self._barrier()

    # ISEND
    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support isend")
    def test_isend(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            requests = [
                dist.isend(_build_tensor(dest, 10), dest) for dest in range(1, world_size)
            ]
            for request in requests:
                request.wait()
                self.assertTrue(request.is_completed())
        else:
            tensor = _build_tensor(rank, -1)
            dist.recv(tensor, 0)
            self.assertEqual(tensor, _build_tensor(rank, 10))

        self._barrier()

    # IRECV
    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support irecv")
    def test_irecv(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
            requests = [
                dist.irecv(expected_tensors[src - 1], src) for src in range(1, world_size)
            ]

            for src in range(1, world_size):
                requests[src - 1].wait()
                self.assertTrue(requests[src - 1].is_completed())
                self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
        else:
            tensor = _build_tensor(rank, 10)
            dist.send(tensor, 0)

        self._barrier()

    # BROADCAST
    def _test_broadcast_helper(self, group, group_id, rank, cuda=False):
        """Broadcast from each member of ``group`` in turn and check receivers."""
        for src in group:
            expected_tensor = _build_tensor(src + 1)
            if cuda:
                expected_tensor = expected_tensor.cuda()
            if rank == src:
                dist.broadcast(expected_tensor, src, group_id)
            else:
                tensor = _build_tensor(src + 1, -1)
                if cuda:
                    tensor = tensor.cuda()
                dist.broadcast(tensor, src, group_id)
                self.assertEqual(tensor, expected_tensor)

        self._barrier()

    def test_broadcast(self):
        group, group_id, rank = self._init_global_test()
        self._test_broadcast_helper(group, group_id, rank)

    # Skip, rather than fail inside ``.cuda()``, when no CUDA device is usable.
    @unittest.skipIf(BACKEND != 'gloo', "Only Gloo backend supports CUDA allReduce")
    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_broadcast_cuda(self):
        group, group_id, rank = self._init_global_test()
        self._test_broadcast_helper(group, group_id, rank, True)

    def test_broadcast_group(self):
        group, group_id, rank = self._init_group_test()
        self._test_broadcast_helper(group, group_id, rank)

    # REDUCE
    def _test_reduce_helper(self, group, group_id, rank, op, master_value, worker_value, expected_value):
        """Reduce onto each member of ``group`` in turn.

        The destination contributes ``master_value``, every other member
        ``worker_value``; only the destination checks for ``expected_value``.
        """
        for src in group:
            if rank == src:
                tensor = _build_tensor(src + 1).fill_(master_value)
                dist.reduce(tensor, src, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
            else:
                tensor = _build_tensor(src + 1).fill_(worker_value)
                dist.reduce(tensor, src, op, group_id)

        self._barrier()

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support reduce")
    def test_reduce_sum(self):
        group, group_id, rank = self._init_global_test()
        self._test_reduce_helper(
            group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1))
        )

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support reduce")
    def test_reduce_product(self):
        group, group_id, rank = self._init_global_test()
        self._test_reduce_helper(
            group, group_id, rank, dist.reduce_op.PRODUCT,
            2, 10, reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2)
        )

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support reduce")
    def test_reduce_min(self):
        group, group_id, rank = self._init_global_test()
        self._test_reduce_helper(
            group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1
        )

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support reduce")
    def test_reduce_max(self):
        group, group_id, rank = self._init_global_test()
        self._test_reduce_helper(
            group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10
        )

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support reduce")
    def test_reduce_group_sum(self):
        group, group_id, rank = self._init_group_test()
        self._test_reduce_helper(
            group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1))
        )

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support reduce")
    def test_reduce_group_product(self):
        group, group_id, rank = self._init_group_test()
        self._test_reduce_helper(
            group, group_id, rank, dist.reduce_op.PRODUCT,
            2, 10, reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2)
        )

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support reduce")
    def test_reduce_group_min(self):
        group, group_id, rank = self._init_group_test()
        self._test_reduce_helper(
            group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1
        )

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support reduce")
    def test_reduce_group_max(self):
        group, group_id, rank = self._init_group_test()
        self._test_reduce_helper(
            group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10
        )

    # ALL REDUCE
    def _test_all_reduce_helper(self, group, group_id, rank, op, master_value,
                                worker_value, expected_value, cuda=False):
        """All-reduce once per member of ``group``.

        The member whose turn it is contributes ``master_value``, the others
        ``worker_value``; every member checks for ``expected_value``.
        """
        for src in group:
            if rank == src:
                tensor = _build_tensor(src + 1).fill_(master_value)
                if cuda:
                    tensor = tensor.cuda()
                dist.all_reduce(tensor, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
            else:
                tensor = _build_tensor(src + 1).fill_(worker_value)
                if cuda:
                    tensor = tensor.cuda()
                dist.all_reduce(tensor, op, group_id)
                self.assertEqual(tensor, _build_tensor(src + 1, expected_value))

        self._barrier()

    def test_all_reduce_sum(self):
        group, group_id, rank = self._init_global_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1))
        )

    # Skip, rather than fail inside ``.cuda()``, when no CUDA device is usable.
    @unittest.skipIf(BACKEND != 'gloo', "Only Gloo backend supports CUDA allReduce")
    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_all_reduce_sum_cuda(self):
        group, group_id, rank = self._init_global_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1)), True
        )

    def test_all_reduce_product(self):
        group, group_id, rank = self._init_global_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.PRODUCT,
            2, 10, reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2)
        )

    def test_all_reduce_min(self):
        group, group_id, rank = self._init_global_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1
        )

    def test_all_reduce_max(self):
        group, group_id, rank = self._init_global_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10
        )

    def test_all_reduce_group_sum(self):
        group, group_id, rank = self._init_group_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.SUM, 2, 10, 2 + (10 * (len(group) - 1))
        )

    def test_all_reduce_group_product(self):
        group, group_id, rank = self._init_group_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.PRODUCT,
            2, 10, reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2)
        )

    def test_all_reduce_group_min(self):
        group, group_id, rank = self._init_group_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1
        )

    def test_all_reduce_group_max(self):
        group, group_id, rank = self._init_group_test()
        self._test_all_reduce_helper(
            group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10
        )

    # SCATTER
    def _test_scatter_helper(self, group, group_id, rank):
        """Scatter from each member in turn; each rank expects a tensor of its own rank."""
        for dest in group:
            tensor = _build_tensor(dest + 1, -1)
            expected_tensor = _build_tensor(dest + 1, rank)
            # Only the scattering process supplies the list of tensors.
            tensors = [_build_tensor(dest + 1, i) for i in group] if rank == dest else []
            dist.scatter(tensor, src=dest, scatter_list=tensors, group=group_id)
            self.assertEqual(tensor, expected_tensor)

        self._barrier()

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support scatter")
    def test_scatter(self):
        group, group_id, rank = self._init_global_test()
        self._test_scatter_helper(group, group_id, rank)

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support scatter")
    def test_scatter_group(self):
        group, group_id, rank = self._init_group_test()
        self._test_scatter_helper(group, group_id, rank)

    # GATHER
    def _test_gather_helper(self, group, group_id, rank):
        """Gather onto each member in turn; the destination checks every slot."""
        for dest in group:
            tensor = _build_tensor(dest + 1, rank)
            # Only the gathering process supplies the output list.
            tensors = [_build_tensor(dest + 1, -1) for i in group] if rank == dest else []
            dist.gather(tensor, dst=dest, gather_list=tensors, group=group_id)
            if rank == dest:
                expected_tensors = [_build_tensor(dest + 1, i) for i in group]
                for t1, t2 in zip(tensors, expected_tensors):
                    self.assertEqual(t1, t2)

        self._barrier()

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support gather")
    def test_gather(self):
        group, group_id, rank = self._init_global_test()
        self._test_gather_helper(group, group_id, rank)

    @unittest.skipIf(BACKEND == 'gloo', "Gloo does not support gather")
    def test_gather_group(self):
        group, group_id, rank = self._init_group_test()
        self._test_gather_helper(group, group_id, rank)

    # ALL GATHER
    def _test_all_gather_helper(self, group, group_id, rank):
        """All-gather once per member; every rank checks every slot."""
        for dest in group:
            tensor = _build_tensor(dest + 1, rank)
            tensors = [_build_tensor(dest + 1, -1) for i in group]
            dist.all_gather(tensors, tensor, group_id)

            expected_tensors = [_build_tensor(dest + 1, i) for i in group]
            for t1, t2 in zip(tensors, expected_tensors):
                self.assertEqual(t1, t2)

        self._barrier()

    def test_all_gather(self):
        group, group_id, rank = self._init_global_test()
        self._test_all_gather_helper(group, group_id, rank)

    def test_all_gather_group(self):
        group, group_id, rank = self._init_group_test()
        self._test_all_gather_helper(group, group_id, rank)

    # BARRIER
    def _test_barrier_helper(self, group, group_id, rank):
        """Check that ``dist.barrier`` holds the others back until ``dest`` arrives."""
        WAIT_TIME = 0.3  # seconds

        for dest in group:
            expected_time = torch.DoubleTensor(1).fill_(0.0)
            if dest == rank:
                # Announce the earliest time anyone may leave the barrier,
                # then arrive late on purpose.
                expected_time.fill_(time.time() + WAIT_TIME)
                dist.broadcast(expected_time, dest, group_id)
                time.sleep(WAIT_TIME + 0.1)  # sleep a little bit longer
                dist.barrier(group_id)
            else:
                dist.broadcast(expected_time, dest, group_id)
                dist.barrier(group_id)
                self.assertGreaterEqual(time.time(), expected_time[0])

        self._barrier()

    def test_barrier(self):
        group, group_id, rank = self._init_global_test()
        self._test_barrier_helper(group, group_id, rank)

    def test_barrier_group(self):
        group, group_id, rank = self._init_group_test()
        self._test_barrier_helper(group, group_id, rank)
if BACKEND == 'tcp' or BACKEND == 'gloo':
    WORLD_SIZE = os.environ['WORLD_SIZE']

    class TestTCPOrGloo(TestCase, _DistTestBase):
        """Runs each distributed test in ``WORLD_SIZE`` child processes.

        The unittest process acts as a manager with rank
        ``MANAGER_PROCESS_RANK``: it spawns the children in ``setUp`` and only
        waits for them and checks their exit codes. The children run the
        actual test body.
        """

        MANAGER_PROCESS_RANK = -1
        # Seconds to wait for each child process in ``_join_and_reduce``.
        JOIN_TIMEOUT = 10

        @staticmethod
        def manager_join(fn):
            """Wrap test ``fn`` so the manager joins children instead of running it."""
            @wraps(fn)
            def wrapper(self):
                if self.rank == self.MANAGER_PROCESS_RANK:
                    self._join_and_reduce()
                else:
                    fn(self)
            return wrapper

        @classmethod
        def setUpClass(cls):
            os.environ['MASTER_ADDR'] = MASTER_ADDR
            os.environ['MASTER_PORT'] = MASTER_PORT
            os.environ['WORLD_SIZE'] = WORLD_SIZE
            # Route every test method through ``manager_join``.
            for attr in dir(cls):
                if attr.startswith('test'):
                    fn = getattr(cls, attr)
                    setattr(cls, attr, cls.manager_join(fn))

        def setUp(self):
            self.processes = []
            self.rank = self.MANAGER_PROCESS_RANK
            Barrier.init()
            for rank in range(int(WORLD_SIZE)):
                self.processes.append(self._spawn_process(rank))

        def tearDown(self):
            for p in self.processes:
                p.terminate()

        def _spawn_process(self, rank):
            """Start and return a child process that runs the current test as ``rank``."""
            # RANK is set in the environment before the child is started.
            os.environ['RANK'] = str(rank)
            name = 'process ' + str(rank)
            process = multiprocessing.Process(target=self._run, name=name,
                                              args=(rank,))
            process.start()
            return process

        def _run(self, rank):
            """Child entry point: join the process group, run the test, exit 0."""
            self.rank = rank
            try:
                dist.init_process_group(backend=BACKEND)
            except RuntimeError as e:
                # NOTE(review): a 'recompile' message presumably means the
                # backend was not built in; the child then exits successfully.
                if 'recompile' in str(e):
                    sys.exit(0)
                # Any other initialization failure must not be swallowed:
                # re-raising gives the child a non-zero exit code, which the
                # manager reports as a test failure.
                raise
            # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
            # We're retrieving a corresponding test and executing it.
            getattr(self, self.id().split(".")[2])()
            sys.exit(0)

        def _join_and_reduce(self):
            """Wait for every child and require that each exited with code 0."""
            for p in self.processes:
                p.join(self.JOIN_TIMEOUT)
                self.assertEqual(p.exitcode, 0)

elif BACKEND == 'mpi':
    # Here the process group is initialized once at import time; no child
    # processes are spawned by this file.
    dist.init_process_group(backend='mpi')

    class TestMPI(TestCase, _DistTestBase):
        pass
# Run the suite when this file is executed as a script.
if '__main__' == __name__:
    unittest.main()

File diff suppressed because it is too large Load Diff

View File

@ -1,16 +1,25 @@
import os
import contextlib
import gc
import os
import sys
import time
import unittest
import contextlib
from sys import platform
import torch
import torch.cuda
import torch.multiprocessing as mp
from common import TestCase
from torch.autograd import Variable
from torch.nn import Parameter
from common import TestCase, run_tests
TEST_REPEATS = 30
HAS_SHM_FILES = os.path.isdir('/dev/shm')
TEST_CUDA_IPC = torch.cuda.is_available() and \
sys.version_info[0] == 3 and \
sys.platform != 'darwin'
TEST_MULTIGPU = TEST_CUDA_IPC and torch.cuda.device_count() > 1
def simple_fill(queue, event):
@ -24,6 +33,59 @@ def simple_pool_fill(tensor):
return tensor.add(1)
def send_tensor(queue, event, tp):
    """Put one 5x5 tensor of ones, converted to ``tp``, on ``queue`` twice.

    The same tensor object is queued both times; the function then blocks
    until ``event`` is set.
    """
    payload = torch.ones(5, 5).type(tp)
    for _ in range(2):
        queue.put(payload)
    event.wait()
def sum_tensors(inq, outq):
    """Read a list of tensors from ``inq`` and report on each one via ``outq``.

    For every tensor a tuple ``(sum, device, numel, storage size)`` is put on
    ``outq``. The work is done with CUDA device 1 selected.
    """
    with torch.cuda.device(1):
        received = inq.get()
        for tensor in received:
            summary = (tensor.sum(), tensor.get_device(),
                       tensor.numel(), tensor.storage().size())
            outq.put(summary)
def queue_get_exception(inqueue, outqueue):
    """Attempt a CUDA allocation and report the outcome on ``outqueue``.

    Puts the raised exception on ``outqueue``, or the string
    ``'no exception'`` if the allocation succeeds. ``inqueue`` is not read.
    """
    os.close(2)  # hide expected error message
    try:
        torch.zeros(5, 5).cuda()
    except Exception as e:
        outcome = e
    else:
        outcome = 'no exception'
    outqueue.put(outcome)
# Multiply by two in a separate stream
def cuda_multiply_two(queue, ready, done):
    """Double, in place, a tensor received from ``queue`` on a fresh CUDA stream.

    Sets ``ready`` on entry. It then takes an ``(event, tensor)`` pair from
    ``queue``, waits on the event, doubles the tensor, records the event
    again and sets ``done``.
    """
    ready.set()
    side_stream = torch.cuda.Stream()
    with torch.cuda.stream(side_stream):
        cuda_event, tensor = queue.get()
        cuda_event.wait()
        tensor.mul_(2)
        cuda_event.record()
        done.set()
        del cuda_event
def autograd_sharing(queue, ready, master_modified):
    """Child-process half of the autograd sharing test.

    Receives a variable from ``queue`` and signals ``ready``. Once
    ``master_modified`` is set it checks that the variable holds 1..25 with
    element [0, 0] replaced by 1000 and that it has no gradient. It then
    overwrites the data with ones, assigns a gradient of ones, and puts the
    boolean result of the checks on ``queue``.
    """
    var = queue.get()
    ready.set()
    master_modified.wait()

    expected_var = torch.arange(1, 26).view(5, 5)
    expected_var[0, 0] = 1000
    data_matches = var.data.equal(expected_var)
    var.data[:] = torch.ones(5, 5)

    # The gradient check is made before a gradient is assigned below.
    grad_missing = var.grad is None
    is_ok = data_matches & grad_missing
    var._grad = Variable(torch.ones(5, 5), requires_grad=False)

    queue.put(is_ok)
@contextlib.contextmanager
def fs_sharing():
prev_strategy = mp.get_sharing_strategy()
@ -41,24 +103,31 @@ class leak_checker(object):
self.test_case = test_case
def __enter__(self):
self.next_fd = self._get_next_fd()
self.next_fds = self._get_next_fds(10)
return self
def __exit__(self, *args):
if args[0] is None:
gc.collect()
self.test_case.assertEqual(self.next_fd, self._get_next_fd())
# Check that the 10th available file-descriptor at the end of the
# test is no more than 4 higher than the 10th available at the
# start. This attempts to catch file descriptor leaks, but allows
# one-off initialization that may use up a file descriptor
# TODO: Disabled because this check is too flaky
# available_fds = self._get_next_fds(10)
# self.test_case.assertLessEqual(
# available_fds[-1] - self.next_fds[-1], 5)
self.test_case.assertFalse(self.has_shm_files())
return False
def check_pid(self, pid):
self.checked_pids.append(pid)
def _get_next_fd(self):
def _get_next_fds(self, n=1):
# dup uses the lowest-numbered unused descriptor for the new descriptor
fd = os.dup(0)
os.close(fd)
return fd
fds = [os.dup(0) for i in range(n)]
for fd in fds:
os.close(fd)
return fds
def has_shm_files(self, wait=True):
if not HAS_SHM_FILES:
@ -81,97 +150,272 @@ class leak_checker(object):
class TestMultiprocessing(TestCase):
def __init__(self, *args, **kwargs):
super(TestMultiprocessing, self).__init__(*args, **kwargs)
def _test_sharing(self):
def do_test():
x = torch.zeros(5, 5)
q = mp.Queue()
e = mp.Event()
def _test_sharing(self, ctx=mp, type=torch.FloatTensor, repeat=1):
def test_fill():
x = torch.zeros(5, 5).type(type)
q = ctx.Queue()
e = ctx.Event()
data = [x, x[:, 1]]
q.put(data)
p = mp.Process(target=simple_fill, args=(q, e))
p = ctx.Process(target=simple_fill, args=(q, e))
p.daemon = True
lc.check_pid(p.pid)
p.start()
e.wait()
e.wait(10)
self.assertTrue(e.is_set())
self.assertTrue(data[0].eq(4).all())
self.assertTrue(data[1].eq(4).all())
p.join(1)
self.assertFalse(p.is_alive())
with leak_checker(self) as lc:
do_test()
def test_receive():
q = ctx.Queue()
e = ctx.Event()
p = ctx.Process(target=send_tensor, args=(q, e, type))
p.daemon = True
lc.check_pid(p.pid)
p.start()
t1 = q.get()
t2 = q.get()
self.assertTrue(t1.eq(1).all())
self.assertTrue(id(t1.storage()) == id(t2.storage()))
e.set()
p.join(1)
self.assertFalse(p.is_alive())
def _test_preserve_sharing(self):
with leak_checker(self) as lc:
for _ in range(repeat):
test_fill()
test_receive()
def _test_preserve_sharing(self, ctx=mp, repeat=1):
def do_test():
x = torch.randn(5, 5)
data = [x.storage(), x, x[2], x[:,1]]
q = mp.Queue()
data = [x.storage(), x.storage()[1:4], x, x[2], x[:, 1]]
q = ctx.Queue()
q.put(data)
new_data = q.get()
new_data = q.get(timeout=1)
self.assertEqual(new_data, data, 0)
storage_cdata = data[0]._cdata
self.assertEqual(new_data[0]._cdata, storage_cdata)
for t in new_data[1:]:
for t in new_data[2:]:
self.assertEqual(t.storage()._cdata, storage_cdata)
# TODO: enable after fixing #46
# new_data[0].fill_(10)
# self.assertEqual(new_data[1], new_data[0][1:4], 0)
with leak_checker(self):
do_test()
for i in range(repeat):
do_test()
def _test_pool(self):
def _test_pool(self, ctx=mp, repeat=1):
def do_test():
p = mp.Pool(2)
p = ctx.Pool(2)
for proc in p._pool:
lc.check_pid(proc.pid)
buffers = (torch.zeros(2, 2) for i in range(4))
buffers = [torch.zeros(2, 2) for i in range(4)]
results = p.map(simple_pool_fill, buffers, 1)
self.assertEqual(len(results), len(buffers))
for r in results:
self.assertEqual(r, torch.ones(2, 2) * 5, 0)
self.assertEqual(len(results), 4)
for b in buffers:
self.assertEqual(b, torch.ones(2, 2) * 4, 0)
p.close()
p.join()
with leak_checker(self) as lc:
do_test()
for i in range(repeat):
do_test()
@unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on OS X")
def test_fd_sharing(self):
self._test_sharing()
self._test_sharing(repeat=TEST_REPEATS)
@unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on OS X")
def test_fd_preserve_sharing(self):
self._test_preserve_sharing()
self._test_preserve_sharing(repeat=TEST_REPEATS)
@unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on OS X")
def test_fd_pool(self):
self._test_pool()
self._test_pool(repeat=TEST_REPEATS)
@unittest.skipIf(platform == "darwin", "file_system sharing strategy doesn't work in OSX")
def test_fs_sharing(self):
with fs_sharing():
self._test_sharing()
self._test_sharing(repeat=TEST_REPEATS)
def test_fs_preserve_sharing(self):
with fs_sharing():
self._test_preserve_sharing()
self._test_preserve_sharing(repeat=TEST_REPEATS)
def test_fs_pool(self):
with fs_sharing():
self._test_pool()
self._test_pool(repeat=TEST_REPEATS)
@unittest.skipIf(not HAS_SHM_FILES, "don't not how to check if shm files exist")
def test_fs(self):
with fs_sharing(), leak_checker(self) as lc:
def queue_put():
x = torch.DoubleStorage(4)
q = mp.Queue()
self.assertFalse(lc.has_shm_files())
q.put(x)
time.sleep(0.05) # queue serializes asynchronously
self.assertTrue(lc.has_shm_files(wait=False))
q.get()
del x
del q # We have to clean up fds for leak_checker
with fs_sharing(), leak_checker(self) as lc:
for _ in range(TEST_REPEATS):
queue_put()
def test_inherit_tensor(self):
    # A tensor in shared memory that a Process subclass holds as an
    # attribute must show the child's in-place update in the parent.
    class SubProcess(mp.Process):
        def __init__(self, tensor):
            super(SubProcess, self).__init__()
            self.tensor = tensor
            self.daemon = True

        def run(self):
            self.tensor.add_(3)

    shared = torch.zeros(5, 5)
    child = SubProcess(shared.share_memory_())
    child.start()
    child.join(1)
    expected = torch.ones(5, 5) * 3
    self.assertEqual(shared, expected, 0)
@unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
def test_cuda(self):
    torch.cuda.FloatTensor([1])  # initialize CUDA outside of leak checker
    spawn_ctx = mp.get_context('spawn')
    self._test_sharing(spawn_ctx, torch.cuda.FloatTensor)
@unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
@unittest.skipIf(not TEST_MULTIGPU, 'found only 1 GPU')
def test_cuda_small_tensors(self):
    """Send five 5-element CUDA tensors (alternating devices 0 and 1) to a spawned process."""
    # Several small tensors will likely share the same underlying
    # cached allocation.
    ctx = mp.get_context('spawn')
    tensors = [torch.arange(k * 5, (k + 1) * 5).cuda(k % 2) for k in range(5)]

    inq = ctx.Queue()
    outq = ctx.Queue()
    inq.put(tensors)

    worker = ctx.Process(target=sum_tensors, args=(inq, outq))
    worker.start()
    results = [outq.get() for _ in range(5)]
    worker.join()

    for idx, _ in enumerate(tensors):
        total, device, tensor_size, storage_size = results[idx]
        self.assertEqual(total, torch.arange(idx * 5, (idx + 1) * 5).sum())
        self.assertEqual(device, idx % 2)
        self.assertEqual(tensor_size, 5)
        self.assertEqual(storage_size, 5)
@unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available')
def test_cuda_bad_call(self):
    """The child running ``queue_get_exception`` is expected to report a RuntimeError."""
    # Round-trip through the GPU so that CUDA gets initialized here.
    cpu_tensor = torch.zeros(5, 5).cuda().cpu()
    inq = mp.Queue()
    outq = mp.Queue()
    child = mp.Process(target=queue_get_exception, args=(inq, outq))
    child.start()
    inq.put(cpu_tensor)
    child.join()
    outcome = outq.get()
    self.assertIsInstance(outcome, RuntimeError)
# Starts a spawned process running `cuda_multiply_two`, then (on a fresh CUDA
# stream) adds 1 to a tensor of ones after a sleep kernel, records an
# interprocess event, sends (event, tensor) through the queue and finally
# expects the tensor to hold [4, 4, 4, 4].
# NOTE(review): indentation was lost in this view, so it is not visible which of
# the statements after `with torch.cuda.stream(...)` belong to the `with` body --
# confirm against the real file before restructuring.
@unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available')
def test_event(self):
ctx = mp.get_context('spawn')
queue = ctx.Queue()
ready = ctx.Event()
done = ctx.Event()
p = ctx.Process(target=cuda_multiply_two, args=(queue, ready, done))
p.start()
# Block until the child signals `ready`.
ready.wait()
with torch.cuda.stream(torch.cuda.Stream()):
tensor = torch.cuda.FloatTensor([1, 1, 1, 1])
# Use a sleep kernel to test events. Without the event, the
# multiply happens before the add.
event = torch.cuda.Event(interprocess=True)
torch.cuda._sleep(20000000) # about 30 ms
tensor.add_(1)
event.record()
queue.put((event, tensor))
done.wait() # must wait until subprocess records event
event.synchronize()
self.assertEqual(list(tensor), [4, 4, 4, 4])
p.join()
def _test_autograd_sharing(self, var):
    """Send ``var`` to a daemon process running ``autograd_sharing`` and check data and grad afterwards."""
    worker_ready = mp.Event()
    parent_modified = mp.Event()
    channel = mp.Queue()
    worker = mp.Process(target=autograd_sharing,
                        args=(channel, worker_ready, parent_modified))
    worker.daemon = True
    worker.start()

    var._grad = Variable(torch.zeros(5, 5), requires_grad=False)
    channel.put(var)

    worker_ready.wait()
    # Modify data and grad on this side, then let the worker know.
    var.data[0, 0] = 1000
    var.grad.data[:] = torch.ones(5, 5) * 4
    parent_modified.set()

    self.assertTrue(channel.get())
    self.assertEqual(var.data, torch.ones(5, 5))
    self.assertEqual(var.grad.data, torch.ones(5, 5) * 4)

    worker.join(1)
    self.assertFalse(worker.is_alive())
def test_variable_sharing(self):
    """Run the autograd sharing check for three (requires_grad, volatile) combinations."""
    flag_pairs = ((True, False), (False, False), (False, True))
    for needs_grad, is_volatile in flag_pairs:
        values = torch.arange(1, 26).view(5, 5)
        self._test_autograd_sharing(
            Variable(values, requires_grad=needs_grad, volatile=is_volatile))
def test_parameter_sharing(self):
    """Run the autograd sharing check on a 5x5 ``Parameter`` holding 1..25."""
    values = torch.arange(1, 26).view(5, 5)
    self._test_autograd_sharing(Parameter(values))
def test_empty_shared(self):
    """Calling ``share_memory_()`` on an empty tensor should simply succeed."""
    torch.Tensor().share_memory_()
def _test_is_shared(self):
    """``is_shared()`` is false for a fresh tensor and true after ``share_memory_()``."""
    tensor = torch.randn(5, 5)
    shared_before = tensor.is_shared()
    self.assertFalse(shared_before)
    tensor.share_memory_()
    shared_after = tensor.is_shared()
    self.assertTrue(shared_after)
@unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on OS X")
def test_is_shared(self):
    """Run the ``is_shared()`` check without entering ``fs_sharing()``."""
    check = self._test_is_shared
    check()
def test_fs_is_shared(self):
    """Run the ``is_shared()`` check inside the ``fs_sharing()`` context."""
    check = self._test_is_shared
    with fs_sharing():
        check()
@unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available')
def test_is_shared_cuda(self):
    """A tensor moved to CUDA is expected to report ``is_shared()`` as true."""
    gpu_tensor = torch.randn(5, 5).cuda()
    self.assertTrue(gpu_tensor.is_shared())
# Script entry point.
# NOTE(review): `unittest.main()` and `run_tests()` appear back to back here;
# this looks like the old and the new line of a diff shown together (indentation
# is also lost). Presumably only one of the two calls is meant to remain --
# confirm against the real file.
if __name__ == '__main__':
unittest.main()
run_tests()

88
test/test_nccl.py Normal file
View File

@ -0,0 +1,88 @@
import unittest
import torch
import torch.cuda.nccl as nccl
import torch.cuda
from common import TestCase, run_tests
# Number of visible CUDA devices; the tests below are skipped when fewer
# than two are present.
nGPUs = torch.cuda.device_count()
if not nGPUs:
    print('CUDA not available, skipping tests')
    # Rebind TestCase to a plain object when there is no GPU at all.
    TestCase = object  # noqa: F811
class TestNCCL(TestCase):
    """Multi-GPU checks for the collectives exposed by ``torch.cuda.nccl``."""

    @unittest.skipIf(nGPUs < 2, "only one GPU detected")
    def test_broadcast(self):
        """After ``broadcast`` every device's tensor should equal the tensor of device 0."""
        expected = torch.FloatTensor(128).uniform_()
        tensors = [expected.cuda()]
        for device in range(1, torch.cuda.device_count()):
            with torch.cuda.device(device):
                tensors.append(torch.cuda.FloatTensor(128))

        nccl.broadcast(tensors)
        for idx in range(torch.cuda.device_count()):
            received = tensors[idx]
            self.assertEqual(received, expected)

    @unittest.skipIf(nGPUs < 2, "only one GPU detected")
    def test_reduce(self):
        """After ``reduce`` the first tensor should hold the element-wise sum of all inputs."""
        cpu_inputs = [torch.FloatTensor(128).uniform_() for _ in range(nGPUs)]
        expected = torch.FloatTensor(128).zero_()
        for chunk in cpu_inputs:
            expected.add_(chunk)

        tensors = [chunk.cuda(dev) for dev, chunk in enumerate(cpu_inputs)]
        nccl.reduce(tensors)
        self.assertEqual(tensors[0], expected)

    @unittest.skipIf(nGPUs < 2, "only one GPU detected")
    def test_all_reduce(self):
        """After ``all_reduce`` every tensor should hold the element-wise sum of all inputs."""
        cpu_inputs = [torch.FloatTensor(128).uniform_() for _ in range(nGPUs)]
        expected = torch.FloatTensor(128).zero_()
        for chunk in cpu_inputs:
            expected.add_(chunk)

        tensors = [chunk.cuda(dev) for dev, chunk in enumerate(cpu_inputs)]
        nccl.all_reduce(tensors)
        for reduced in tensors:
            self.assertEqual(reduced, expected)

    @unittest.skipIf(nGPUs < 2, "only one GPU detected")
    def test_all_gather(self):
        """After ``all_gather`` every output should be the concatenation of all inputs."""
        cpu_inputs = [torch.FloatTensor(128).uniform_() for _ in range(nGPUs)]
        expected = torch.cat(cpu_inputs, 0)

        inputs = [chunk.cuda(dev) for dev, chunk in enumerate(cpu_inputs)]
        outputs = [torch.cuda.FloatTensor(128 * nGPUs, device=dev)
                   for dev in range(nGPUs)]
        nccl.all_gather(inputs, outputs)
        for gathered in outputs:
            self.assertEqual(gathered, expected)

    @unittest.skipIf(nGPUs < 2, "only one GPU detected")
    def test_reduce_scatter(self):
        """After ``reduce_scatter`` output ``i`` should be row ``i`` of the summed inputs."""
        out_size = 32
        in_size = 32 * nGPUs
        cpu_inputs = [torch.FloatTensor(in_size).uniform_() for _ in range(nGPUs)]
        expected = torch.FloatTensor(in_size).zero_()
        for chunk in cpu_inputs:
            expected.add_(chunk)
        # One row of 32 values per device.
        expected = expected.view(nGPUs, 32)

        inputs = [chunk.cuda(dev) for dev, chunk in enumerate(cpu_inputs)]
        outputs = [torch.cuda.FloatTensor(out_size, device=dev)
                   for dev in range(nGPUs)]
        nccl.reduce_scatter(inputs, outputs)
        for dev in range(nGPUs):
            self.assertEqual(outputs[dev], expected[dev])
# Run the NCCL tests when this file is executed as a script.
if '__main__' == __name__:
    run_tests()

File diff suppressed because it is too large Load Diff

558
test/test_optim.py Normal file
View File

@ -0,0 +1,558 @@
import unittest
import functools
from copy import deepcopy
import torch
import torch.optim as optim
import torch.legacy.optim as old_optim
import torch.nn.functional as F
from torch.optim import SGD
from torch.autograd import Variable
from torch import sparse
from torch.optim.lr_scheduler import LambdaLR, StepLR, MultiStepLR, ExponentialLR, ReduceLROnPlateau
from common import TestCase, run_tests
def rosenbrock(tensor):
    """Evaluate ``(1 - x)**2 + 100 * (y - x**2)**2`` for ``tensor = (x, y)``."""
    x, y = tensor
    first_term = (1 - x) ** 2
    second_term = 100 * (y - x ** 2) ** 2
    return first_term + second_term
def drosenbrock(tensor):
    """Return the gradient of ``rosenbrock`` at ``tensor = (x, y)`` as a 2-element DoubleTensor."""
    x, y = tensor
    grad_x = -400 * x * (y - x ** 2) - 2 * (1 - x)
    grad_y = 200 * (y - x ** 2)
    return torch.DoubleTensor((grad_x, grad_y))
def wrap_old_fn(old_fn, **config):
    """Bind ``config`` so ``old_fn(closure, params, config, state)`` can be called as ``f(closure, params, state)``."""
    return lambda closure, params, state: old_fn(closure, params, config, state)
class TestOptim(TestCase):
# Runs 2000 steps of the optimizer built by `constructor` next to the legacy
# optimizer function `old_fn`, both starting from [1.5, 1.5], and compares the
# parameters of the two; it also checks that the distance to [1, 1] did not grow.
# NOTE(review): indentation was lost in this view, so it is not visible whether
# `self.assertEqual(params.data, params_t)` runs on every iteration or once
# after the loop -- confirm against the real file.
def _test_rosenbrock(self, constructor, old_fn):
params_t = torch.Tensor([1.5, 1.5])
state = {}
params = Variable(torch.Tensor([1.5, 1.5]), requires_grad=True)
optimizer = constructor([params])
solution = torch.Tensor([1, 1])
initial_dist = params.data.dist(solution)
def eval():
optimizer.zero_grad()
loss = rosenbrock(params)
loss.backward()
# loss.backward() will give **slightly** different
# gradients than drosenbrock, because of a different ordering
# of floating point operations. In most cases it doesn't matter,
# but some optimizers are so sensitive that they can temporarily
# diverge up to 1e-4, just to converge again. This makes the
# comparison more stable.
params.grad.data.copy_(drosenbrock(params.data))
return loss
for i in range(2000):
optimizer.step(eval)
old_fn(lambda _: (rosenbrock(params_t), drosenbrock(params_t)),
params_t, state)
self.assertEqual(params.data, params_t)
self.assertLessEqual(params.data.dist(solution), initial_dist)
# Optimizes two copies of the same starting point with optimizers from
# `constructor`: one is fed sparse gradients, the other the dense equivalent
# (`x.to_dense()`); the parameters of both are then compared.
# NOTE(review): indentation was lost in this view, so it is not visible whether
# `self.assertEqual(params.data, params_c.data)` runs on every iteration or once
# after the loop -- confirm against the real file.
# NOTE(review): the closure always calls `optimizer.zero_grad()`, also when it is
# invoked for `params_c` / `optimizer_c`; the gradient is overwritten right after,
# so this looks harmless, but verify that it is intended.
def _test_rosenbrock_sparse(self, constructor):
params_t = torch.Tensor([1.5, 1.5])
params = Variable(torch.Tensor([1.5, 1.5]), requires_grad=True)
params_c = Variable(torch.Tensor([1.5, 1.5]), requires_grad=True)
optimizer = constructor([params])
optimizer_c = constructor([params_c])
solution = torch.Tensor([1, 1])
initial_dist = params.data.dist(solution)
def eval(params, sparse_grad, w):
# Depending on w, provide only the x or y gradient
optimizer.zero_grad()
loss = rosenbrock(params)
loss.backward()
grad = drosenbrock(params.data)
# NB: We torture test the optimizer by returning an
# uncoalesced sparse tensor
if w:
i = torch.LongTensor([[0, 0]])
x = grad[0]
v = torch.DoubleTensor([x / 4., x - x / 4.])
else:
i = torch.LongTensor([[1, 1]])
y = grad[1]
v = torch.DoubleTensor([y - y / 4., y / 4.])
# The two values stored at the same index add up to the full gradient entry.
x = sparse.DoubleTensor(i, v, torch.Size([2]))
if sparse_grad:
params.grad.data = x
else:
params.grad.data = x.to_dense()
return loss
for i in range(2000):
# Do cyclic coordinate descent
w = i % 2
optimizer.step(functools.partial(eval, params, True, w))
optimizer_c.step(functools.partial(eval, params_c, False, w))
self.assertEqual(params.data, params_c.data)
self.assertLessEqual(params.data.dist(solution), initial_dist)
# Wraps `weight`, `bias` and `input` in Variables, builds an optimizer with
# `constructor(weight, bias)` and checks that optimizing the loss
# sum((weight.mv(input) + bias) ** 2) brings it below its initial value.
# NOTE(review): indentation was lost in this view, so it is not visible whether
# the final `assertLess` runs inside the 200-step loop or once after it --
# confirm against the real file.
def _test_basic_cases_template(self, weight, bias, input, constructor):
weight = Variable(weight, requires_grad=True)
bias = Variable(bias, requires_grad=True)
input = Variable(input)
optimizer = constructor(weight, bias)
def fn():
optimizer.zero_grad()
y = weight.mv(input)
# Move y to the bias' device when both are on CUDA but on different devices.
if y.is_cuda and bias.is_cuda and y.get_device() != bias.get_device():
y = y.cuda(bias.get_device())
loss = (y + bias).pow(2).sum()
loss.backward()
return loss
initial_value = fn().data[0]
for i in range(200):
optimizer.step(fn)
self.assertLess(fn().data[0], initial_value)
# Primes an optimizer for 20 steps, clones the parameters, loads a deep copy of
# the optimizer's state dict into a second optimizer and then steps both,
# comparing the parameters; finally checks that the loaded state dict copy
# still equals the other copy.
# NOTE(review): indentation was lost in this view, so it is not visible whether
# the weight/bias `assertEqual` calls run on every iteration of the second loop
# or once after it -- confirm against the real file.
def _test_state_dict(self, weight, bias, input, constructor):
weight = Variable(weight, requires_grad=True)
bias = Variable(bias, requires_grad=True)
input = Variable(input)
def fn_base(optimizer, weight, bias):
optimizer.zero_grad()
loss = (weight.mv(input) + bias).pow(2).sum()
loss.backward()
return loss
optimizer = constructor(weight, bias)
fn = functools.partial(fn_base, optimizer, weight, bias)
# Prime the optimizer
for i in range(20):
optimizer.step(fn)
# Clone the weights and construct new optimizer for them
weight_c = Variable(weight.data.clone(), requires_grad=True)
bias_c = Variable(bias.data.clone(), requires_grad=True)
optimizer_c = constructor(weight_c, bias_c)
fn_c = functools.partial(fn_base, optimizer_c, weight_c, bias_c)
# Load state dict
# Both copies come from `optimizer`; only `state_dict_c` is loaded.
state_dict = deepcopy(optimizer.state_dict())
state_dict_c = deepcopy(optimizer.state_dict())
optimizer_c.load_state_dict(state_dict_c)
# Run both optimizations in parallel
for i in range(20):
optimizer.step(fn)
optimizer_c.step(fn_c)
self.assertEqual(weight, weight_c)
self.assertEqual(bias, bias_c)
# Make sure state dict wasn't modified
self.assertEqual(state_dict, state_dict_c)
def _test_basic_cases(self, constructor, ignore_multidevice=False):
    """Run the state-dict check and the basic template on CPU, then on CUDA when available.

    The mixed-device case runs only with more than one CUDA device and when
    ``ignore_multidevice`` is false.
    """
    self._test_state_dict(
        torch.randn(10, 5), torch.randn(10), torch.randn(5), constructor)

    run_template = self._test_basic_cases_template
    run_template(torch.randn(10, 5), torch.randn(10), torch.randn(5), constructor)

    # non-contiguous parameters
    run_template(
        torch.randn(10, 5, 2)[..., 0],
        torch.randn(10, 2)[..., 0],
        torch.randn(5),
        constructor)

    # CUDA
    if torch.cuda.is_available():
        run_template(
            torch.randn(10, 5).cuda(),
            torch.randn(10).cuda(),
            torch.randn(5).cuda(),
            constructor)

        # Multi-GPU
        if torch.cuda.device_count() > 1 and not ignore_multidevice:
            run_template(
                torch.randn(10, 5).cuda(0),
                torch.randn(10).cuda(1),
                torch.randn(5).cuda(0),
                constructor)
def _build_params_dict(self, weight, bias, **kwargs):
    """Return two param groups: one for ``weight`` and one for ``bias`` that also carries ``kwargs``."""
    weight_group = dict(params=[weight])
    bias_group = dict(params=[bias], **kwargs)
    return [weight_group, bias_group]
def _build_params_dict_single(self, weight, bias, **kwargs):
    """Return a single param group whose ``params`` entry is ``bias`` itself (not a list).

    ``weight`` is accepted but not used.
    """
    only_group = dict(params=bias, **kwargs)
    return [only_group]
def test_sgd(self):
    """Compare ``optim.SGD`` with legacy ``old_optim.sgd`` on Rosenbrock, then run the basic cases."""
    rosenbrock_cases = [
        (lambda params: optim.SGD(params, lr=1e-3),
         wrap_old_fn(old_optim.sgd, learningRate=1e-3)),
        (lambda params: optim.SGD(params, lr=1e-3, momentum=0.9,
                                  dampening=0, weight_decay=1e-4),
         wrap_old_fn(old_optim.sgd, learningRate=1e-3, momentum=0.9,
                     dampening=0, weightDecay=1e-4)),
    ]
    for new_ctor, legacy_fn in rosenbrock_cases:
        self._test_rosenbrock(new_ctor, legacy_fn)

    basic_ctors = [
        lambda weight, bias: optim.SGD([weight, bias], lr=1e-3),
        # per-group learning rate for the bias
        lambda weight, bias: optim.SGD(
            self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3),
        # a single group that holds only the bias
        lambda weight, bias: optim.SGD(
            self._build_params_dict_single(weight, bias, lr=1e-2), lr=1e-3),
    ]
    for ctor in basic_ctors:
        self._test_basic_cases(ctor)
def test_adam(self):
    """Compare ``optim.Adam`` with legacy ``old_optim.adam`` on Rosenbrock, then run the basic cases."""
    rosenbrock_cases = [
        (lambda params: optim.Adam(params, lr=1e-2),
         wrap_old_fn(old_optim.adam, learningRate=1e-2)),
        (lambda params: optim.Adam(params, lr=1e-2, weight_decay=1e-2),
         wrap_old_fn(old_optim.adam, learningRate=1e-2, weightDecay=1e-2)),
    ]
    for new_ctor, legacy_fn in rosenbrock_cases:
        self._test_rosenbrock(new_ctor, legacy_fn)

    basic_ctors = [
        lambda weight, bias: optim.Adam([weight, bias], lr=1e-3),
        lambda weight, bias: optim.Adam(
            self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3),
    ]
    for ctor in basic_ctors:
        self._test_basic_cases(ctor)
def test_adadelta(self):
    """Compare ``optim.Adadelta`` with legacy ``old_optim.adadelta`` on Rosenbrock, then run the basic cases."""
    rosenbrock_cases = [
        (lambda params: optim.Adadelta(params),
         wrap_old_fn(old_optim.adadelta)),
        (lambda params: optim.Adadelta(params, rho=0.95),
         wrap_old_fn(old_optim.adadelta, rho=0.95)),
        (lambda params: optim.Adadelta(params, weight_decay=1e-2),
         wrap_old_fn(old_optim.adadelta, weightDecay=1e-2)),
    ]
    for new_ctor, legacy_fn in rosenbrock_cases:
        self._test_rosenbrock(new_ctor, legacy_fn)

    basic_ctors = [
        lambda weight, bias: optim.Adadelta([weight, bias]),
        lambda weight, bias: optim.Adadelta(
            self._build_params_dict(weight, bias, rho=0.95)),
    ]
    for ctor in basic_ctors:
        self._test_basic_cases(ctor)
def test_adagrad(self):
    """Compare ``optim.Adagrad`` with legacy ``old_optim.adagrad`` on Rosenbrock, then run the basic cases."""
    rosenbrock_cases = [
        (lambda params: optim.Adagrad(params, lr=1e-1),
         wrap_old_fn(old_optim.adagrad, learningRate=1e-1)),
        (lambda params: optim.Adagrad(params, lr=1e-1, lr_decay=1e-3),
         wrap_old_fn(old_optim.adagrad, learningRate=1e-1, learningRateDecay=1e-3)),
        (lambda params: optim.Adagrad(params, lr=1e-1, weight_decay=1e-2),
         wrap_old_fn(old_optim.adagrad, learningRate=1e-1, weightDecay=1e-2)),
    ]
    for new_ctor, legacy_fn in rosenbrock_cases:
        self._test_rosenbrock(new_ctor, legacy_fn)

    basic_ctors = [
        lambda weight, bias: optim.Adagrad([weight, bias], lr=1e-1),
        lambda weight, bias: optim.Adagrad(
            self._build_params_dict(weight, bias, lr=1e-2), lr=1e-1),
    ]
    for ctor in basic_ctors:
        self._test_basic_cases(ctor)
def test_adagrad_sparse(self):
self._test_rosenbrock_sparse(
lambda params: optim.Adagrad(params, lr=1e-1)
)
def test_adamax(self):
    """Compare ``optim.Adamax`` with legacy ``old_optim.adamax`` on Rosenbrock, then run the basic cases.

    The basic cases construct ``optim.Adamax``; they previously built
    ``optim.Adagrad`` by mistake, so Adamax itself was never exercised there.
    """
    self._test_rosenbrock(
        lambda params: optim.Adamax(params, lr=1e-1),
        wrap_old_fn(old_optim.adamax, learningRate=1e-1)
    )
    self._test_rosenbrock(
        lambda params: optim.Adamax(params, lr=1e-1, weight_decay=1e-2),
        wrap_old_fn(old_optim.adamax, learningRate=1e-1, weightDecay=1e-2)
    )
    self._test_rosenbrock(
        lambda params: optim.Adamax(params, lr=1e-1, betas=(0.95, 0.998)),
        wrap_old_fn(old_optim.adamax, learningRate=1e-1, beta1=0.95, beta2=0.998)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.Adamax([weight, bias], lr=1e-1)
    )
    # Same, with a per-group learning rate for the bias.
    self._test_basic_cases(
        lambda weight, bias: optim.Adamax(
            self._build_params_dict(weight, bias, lr=1e-2),
            lr=1e-1)
    )
def test_rmsprop(self):
    """Compare ``optim.RMSprop`` with legacy ``old_optim.rmsprop`` on Rosenbrock, then run the basic cases.

    The basic cases construct ``optim.RMSprop``; they previously built
    ``optim.Adagrad`` by mistake, so RMSprop itself was never exercised there.
    """
    self._test_rosenbrock(
        lambda params: optim.RMSprop(params, lr=1e-2),
        wrap_old_fn(old_optim.rmsprop, learningRate=1e-2)
    )
    self._test_rosenbrock(
        lambda params: optim.RMSprop(params, lr=1e-2, weight_decay=1e-2),
        wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, weightDecay=1e-2)
    )
    self._test_rosenbrock(
        lambda params: optim.RMSprop(params, lr=1e-2, alpha=0.95),
        wrap_old_fn(old_optim.rmsprop, learningRate=1e-2, alpha=0.95)
    )
    self._test_basic_cases(
        lambda weight, bias: optim.RMSprop([weight, bias], lr=1e-2)
    )
    # Same, with a per-group learning rate for the bias.
    self._test_basic_cases(
        lambda weight, bias: optim.RMSprop(
            self._build_params_dict(weight, bias, lr=1e-3),
            lr=1e-2)
    )
def test_asgd(self):
    """Compare ``optim.ASGD`` with legacy ``old_optim.asgd`` on Rosenbrock, then run the basic cases."""
    rosenbrock_cases = [
        (lambda params: optim.ASGD(params, lr=1e-3),
         wrap_old_fn(old_optim.asgd, eta0=1e-3)),
        (lambda params: optim.ASGD(params, lr=1e-3, alpha=0.8),
         wrap_old_fn(old_optim.asgd, eta0=1e-3, alpha=0.8)),
        (lambda params: optim.ASGD(params, lr=1e-3, t0=1e3),
         wrap_old_fn(old_optim.asgd, eta0=1e-3, t0=1e3)),
    ]
    for new_ctor, legacy_fn in rosenbrock_cases:
        self._test_rosenbrock(new_ctor, legacy_fn)

    basic_ctors = [
        lambda weight, bias: optim.ASGD([weight, bias], lr=1e-3, t0=100),
        lambda weight, bias: optim.ASGD(
            self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3, t0=100),
    ]
    for ctor in basic_ctors:
        self._test_basic_cases(ctor)
def test_rprop(self):
    """Compare ``optim.Rprop`` with legacy ``old_optim.rprop`` on Rosenbrock, then run the basic cases."""
    rosenbrock_cases = [
        (lambda params: optim.Rprop(params, lr=1e-3),
         wrap_old_fn(old_optim.rprop, stepsize=1e-3)),
        (lambda params: optim.Rprop(params, lr=1e-3, etas=(0.6, 1.1)),
         wrap_old_fn(old_optim.rprop, stepsize=1e-3, etaminus=0.6, etaplus=1.1)),
        (lambda params: optim.Rprop(params, lr=1e-3, step_sizes=(1e-4, 3)),
         wrap_old_fn(old_optim.rprop, stepsize=1e-3, stepsizemin=1e-4, stepsizemax=3)),
    ]
    for new_ctor, legacy_fn in rosenbrock_cases:
        self._test_rosenbrock(new_ctor, legacy_fn)

    basic_ctors = [
        lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3),
        lambda weight, bias: optim.Rprop(
            self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3),
    ]
    for ctor in basic_ctors:
        self._test_basic_cases(ctor)
def test_lbfgs(self):
    """Compare ``optim.LBFGS`` with legacy ``old_optim.lbfgs`` on Rosenbrock, then run the basic cases."""
    rosenbrock_cases = [
        (lambda params: optim.LBFGS(params),
         wrap_old_fn(old_optim.lbfgs)),
        (lambda params: optim.LBFGS(params, lr=5e-2, max_iter=5),
         wrap_old_fn(old_optim.lbfgs, learningRate=5e-2, maxIter=5)),
    ]
    for new_ctor, legacy_fn in rosenbrock_cases:
        self._test_rosenbrock(new_ctor, legacy_fn)

    # The mixed-device case is explicitly skipped for LBFGS.
    def make_lbfgs(weight, bias):
        return optim.LBFGS([weight, bias])

    self._test_basic_cases(make_lbfgs, ignore_multidevice=True)
def test_invalid_param_type(self):
with self.assertRaises(TypeError):
optim.SGD(Variable(torch.randn(5, 5)), lr=3)
class SchedulerTestNet(torch.nn.Module):
    """Two single-channel 1x1 convolutions with a ReLU in between."""

    def __init__(self):
        super(SchedulerTestNet, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 1, 1)
        self.conv2 = torch.nn.Conv2d(1, 1, 1)

    def forward(self, x):
        hidden = F.relu(self.conv1(x))
        return self.conv2(hidden)
class TestLRScheduler(TestCase):
def setUp(self):
    """Create the test net and an SGD optimizer with one param group per convolution."""
    self.net = SchedulerTestNet()
    # conv1 uses the default lr (0.05); conv2 overrides it with 0.5.
    param_groups = [
        {'params': self.net.conv1.parameters()},
        {'params': self.net.conv2.parameters(), 'lr': 0.5},
    ]
    self.opt = SGD(param_groups, lr=0.05)
def test_step_lr(self):
# lr = 0.05 if epoch < 3
# lr = 0.005 if 30 <= epoch < 6
# lr = 0.0005 if epoch >= 9
single_targets = [0.05] * 3 + [0.005] * 3 + [0.0005] * 3 + [0.00005] * 3
targets = [single_targets, list(map(lambda x: x * 10, single_targets))]
scheduler = StepLR(self.opt, gamma=0.1, step_size=3)
epochs = 10
self._test(scheduler, targets, epochs)
def test_multi_step_lr(self):
# lr = 0.05 if epoch < 2
# lr = 0.005 if 2 <= epoch < 5
# lr = 0.0005 if epoch < 9
# lr = 0.00005 if epoch >= 9
single_targets = [0.05] * 2 + [0.005] * 3 + [0.0005] * 4 + [0.00005] * 3
targets = [single_targets, list(map(lambda x: x * 10, single_targets))]
scheduler = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
epochs = 10
self._test(scheduler, targets, epochs)
def test_exp_lr(self):
single_targets = [0.05 * (0.9 ** x) for x in range(10)]
targets = [single_targets, list(map(lambda x: x * 10, single_targets))]
scheduler = ExponentialLR(self.opt, gamma=0.9)
epochs = 10
self._test(scheduler, targets, epochs)
def test_reduce_lr_on_plateau1(self):
for param_group in self.opt.param_groups:
param_group['lr'] = 0.5
targets = [[0.5] * 20]
metrics = [10 - i * 0.0167 for i in range(20)]
scheduler = ReduceLROnPlateau(self.opt, threshold_mode='abs', mode='min',
threshold=0.01, patience=5, cooldown=5)
epochs = 10
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
def test_reduce_lr_on_plateau2(self):
for param_group in self.opt.param_groups:
param_group['lr'] = 0.5
targets = [[0.5] * 6 + [0.05] * 7 + [0.005] * 7 + [0.0005] * 2]
metrics = [10 - i * 0.0165 for i in range(22)]
scheduler = ReduceLROnPlateau(self.opt, patience=5, cooldown=0, threshold_mode='abs',
mode='min', threshold=0.1)
epochs = 22
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
def test_reduce_lr_on_plateau3(self):
for param_group in self.opt.param_groups:
param_group['lr'] = 0.5
targets = [[0.5] * (2 + 6) + [0.05] * (5 + 6) + [0.005] * 4]
metrics = [-0.8] * 2 + [-0.234] * 20
scheduler = ReduceLROnPlateau(self.opt, mode='max', patience=5, cooldown=5,
threshold_mode='abs')
epochs = 22
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
def test_reduce_lr_on_plateau4(self):
for param_group in self.opt.param_groups:
param_group['lr'] = 0.5
targets = [[0.5] * 20]
metrics = [1.5 * (1.025 ** i) for i in range(20)] # 1.025 > 1.1**0.25
scheduler = ReduceLROnPlateau(self.opt, mode='max', patience=3,
threshold_mode='rel', threshold=0.1)
epochs = 20
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
def test_reduce_lr_on_plateau5(self):
for param_group in self.opt.param_groups:
param_group['lr'] = 0.5
targets = [[0.5] * 6 + [0.05] * (5 + 6) + [0.005] * 4]
metrics = [1.5 * (1.005 ** i) for i in range(20)]
scheduler = ReduceLROnPlateau(self.opt, mode='max', threshold_mode='rel',
threshold=0.1, patience=5, cooldown=5)
epochs = 20
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
def test_reduce_lr_on_plateau6(self):
for param_group in self.opt.param_groups:
param_group['lr'] = 0.5
targets = [[0.5] * 20]
metrics = [1.5 * (0.85 ** i) for i in range(20)]
scheduler = ReduceLROnPlateau(self.opt, mode='min', threshold_mode='rel',
threshold=0.1)
epochs = 20
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
def test_reduce_lr_on_plateau7(self):
for param_group in self.opt.param_groups:
param_group['lr'] = 0.5
targets = [[0.5] * 6 + [0.05] * (5 + 6) + [0.005] * 4]
metrics = [1] * 7 + [0.6] + [0.5] * 12
scheduler = ReduceLROnPlateau(self.opt, mode='min', threshold_mode='rel',
threshold=0.1, patience=5, cooldown=5)
epochs = 20
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
def test_reduce_lr_on_plateau8(self):
for param_group in self.opt.param_groups:
param_group['lr'] = 0.5
targets = [[0.5] * 6 + [0.4] * 14, [0.5] * 6 + [0.3] * 14]
metrics = [1.5 * (1.005 ** i) for i in range(20)]
scheduler = ReduceLROnPlateau(self.opt, mode='max', threshold_mode='rel', min_lr=[0.4, 0.3],
threshold=0.1, patience=5, cooldown=5)
epochs = 20
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
def test_lambda_lr(self):
self.opt.param_groups[0]['lr'] = 0.05
self.opt.param_groups[1]['lr'] = 0.4
targets = [[0.05 * (0.9 ** x) for x in range(10)], [0.4 * (0.8 ** x) for x in range(10)]]
scheduler = LambdaLR(self.opt,
lr_lambda=[lambda x1: 0.9 ** x1, lambda x2: 0.8 ** x2])
epochs = 10
self._test(scheduler, targets, epochs)
def _test(self, scheduler, targets, epochs=10):
    """Step ``scheduler`` with each epoch index and compare every group's LR to ``targets[group][epoch]``."""
    for epoch in range(epochs):
        scheduler.step(epoch)
        for group, expected in zip(self.opt.param_groups, targets):
            want = expected[epoch]
            got = group['lr']
            failure = 'LR is wrong in epoch {}: expected {}, got {}'.format(
                epoch, want, got)
            self.assertAlmostEqual(want, got, msg=failure, delta=1e-5)
def _test_reduce_lr_on_plateau(self, scheduler, targets, metrics, epochs=10, verbose=False):
    """Step ``scheduler`` with ``metrics[epoch]`` and compare every group's LR to ``targets[group][epoch]``.

    With ``verbose`` set, the first group's LR is printed after each step.
    """
    for epoch in range(epochs):
        scheduler.step(metrics[epoch])
        if verbose:
            first_lr = self.opt.param_groups[0]['lr']
            print('epoch{}:\tlr={}'.format(epoch, first_lr))
        for group, expected in zip(self.opt.param_groups, targets):
            want = expected[epoch]
            got = group['lr']
            failure = 'LR is wrong in epoch {}: expected {}, got {}'.format(
                epoch, want, got)
            self.assertAlmostEqual(want, got, msg=failure, delta=1e-5)
# Run the optimizer and scheduler tests when this file is executed as a script.
if '__main__' == __name__:
    run_tests()

625
test/test_sparse.py Normal file
View File

@ -0,0 +1,625 @@
import torch
from torch import sparse
import itertools
import random
import unittest
from common import TestCase, run_tests
from common_nn import TEST_CUDA
from numbers import Number
def cpu_only(inner):
    """Decorate a test method so that it is skipped when ``self.is_cuda`` is true.

    The wrapper keeps the wrapped function's name and docstring and passes its
    return value through.

    Raises (from the wrapper):
        unittest.SkipTest: when ``self.is_cuda`` is true.
    """
    # Local import so the decorator does not depend on the file's import block.
    import functools

    @functools.wraps(inner)
    def outer(self, *args, **kwargs):
        if self.is_cuda:
            raise unittest.SkipTest("Test is CPU-only")
        return inner(self, *args, **kwargs)
    return outer
def cuda_only(inner):
    """Decorate a test method so that it is skipped unless ``self.is_cuda`` is true.

    The wrapper keeps the wrapped function's name and docstring and passes its
    return value through.

    Raises (from the wrapper):
        unittest.SkipTest: when ``self.is_cuda`` is false.
    """
    # Local import so the decorator does not depend on the file's import block.
    import functools

    @functools.wraps(inner)
    def outer(self, *args, **kwargs):
        if not self.is_cuda:
            raise unittest.SkipTest("Test is GPU-only")
        return inner(self, *args, **kwargs)
    return outer
class TestSparse(TestCase):
def setUp(self):
    """Set the defaults that select how the tests run: CPU, coalesced, double precision."""
    # These attributes control the various ways the tests can run.
    # Subclasses are meant to override this method to implement the
    # CUDA tests.
    self.is_cuda, self.is_uncoalesced = False, False
    self.IndexTensor = torch.LongTensor
    self.ValueTensor = torch.DoubleTensor
    self.SparseTensor = torch.sparse.DoubleTensor
def _gen_sparse(self, d, nnz, with_size):
    """Generate a random sparse tensor with ``d`` sparse dimensions.

    ``with_size`` is either a number (used for each of the ``d`` dimensions)
    or a full size list; entries past ``d`` become dense value dimensions.
    Returns ``(sparse, indices, values)``, moved to CUDA when ``self.is_cuda``.
    """
    # TODO: Consider implementing this in the CUDA case by directly
    # performing the operations on the GPU. You won't be able to
    # use torch.rand/torch.randn in this case because they are
    # CPU-only. If you do this, you can remove the is_cuda branch
    # at the end.
    #
    # If you do this, be sure to update assert_uncoalesced too
    sizes = [with_size] * d if isinstance(with_size, Number) else with_size

    # In the uncoalesced case every position is generated twice, to stress
    # test whether that (subtle) case is handled correctly.
    entries = nnz * 2 if self.is_uncoalesced else nnz
    values = torch.randn(*([entries] + list(sizes[d:])))
    positions = torch.rand(d, nnz)
    if self.is_uncoalesced:
        # Repeat the indexes, so every position shows up twice
        positions = torch.cat([positions, positions], dim=1)
    scale = torch.Tensor(sizes[:d]).repeat(entries, 1).transpose(0, 1)
    indices = (positions * scale).type(torch.LongTensor)
    generated = torch.sparse.DoubleTensor(indices, values, torch.Size(sizes))
    if self.is_uncoalesced:
        self.assert_uncoalesced(generated)

    if self.is_cuda:
        return generated.cuda(), indices.cuda(), values.cuda()
    return generated, indices.clone(), values.clone()
def assert_uncoalesced(self, x):
    """
    Test if a CPU tensor is uncoalesced. This is used to ensure
    correctness of the uncoalesced tensor generation algorithm.

    Uses TestCase assertions instead of bare ``assert`` statements so the
    checks still run under ``python -O``.
    """
    self.assertFalse(x.is_coalesced())
    # Strategy: construct a new sparse tensor with the raw value
    # field overwritten to a tensor of ones, coalesce it, and then
    # check if any value entries are > 1 (which indicates that the
    # original was uncoalesced.)
    i = x._indices().clone()
    v = x._values().clone().fill_(1)
    y = torch.sparse.DoubleTensor(i, v, x.size())
    z = self.safeCoalesce(y)
    self.assertTrue((z._values() > 1).sum() > 0)
def randn(self, *args, **kwargs):
    """
    Build a ``self.ValueTensor`` from the given arguments and fill it in
    place with ``normal_()``; a variant of torch.randn that also works in
    the TEST_CUDA case.
    """
    # TODO: Put this in torch.cuda.randn
    fresh = self.ValueTensor(*args, **kwargs)
    return fresh.normal_()
def test_basic(self):
    """Generated tensors expose their indices, values, dimension count, nnz and sizes."""
    for with_size in (100, [100, 100, 100]):
        generated, indices, values = self._gen_sparse(3, 10, with_size)
        self.assertEqual(indices, generated._indices())
        self.assertEqual(values, generated._values())

    # The remaining checks apply to the tensor built from the explicit size list.
    self.assertEqual(generated.ndimension(), 3)
    self.assertEqual(generated.coalesce()._nnz(), 10)
    for dim in range(3):
        self.assertEqual(generated.size(dim), 100)

    # Make sure we can access empty indices / values
    empty = self.SparseTensor()
    self.assertEqual(empty._indices().numel(), 0)
    self.assertEqual(empty._values().numel(), 0)
def test_to_dense(self):
    """``to_dense()`` of a 3x4x5 sparse tensor with four entries matches the hand-written dense tensor."""
    indices = self.IndexTensor([
        [0, 1, 2, 2],
        [0, 0, 0, 3],
        [0, 0, 1, 4],
    ])
    values = self.ValueTensor([2, 1, 3, 4])
    sparse_t = self.SparseTensor(indices, values, torch.Size([3, 4, 5]))
    expected = self.ValueTensor([
        [[2, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0]],
        [[1, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0]],
        [[0, 3, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 4]],
    ])

    # Tests double to_dense for memory corruption
    for _ in range(3):
        sparse_t.to_dense()
    self.assertEqual(expected, sparse_t.to_dense())
def test_shared(self):
    """Writes to the index and value tensors after construction show up in ``to_dense()``."""
    indices = self.IndexTensor([[2]])
    values = self.ValueTensor([5])
    sparse_t = self.SparseTensor(indices, values, torch.Size([3]))

    values[0] = 6
    after_value_write = self.ValueTensor([0, 0, 6])
    self.assertEqual(after_value_write, sparse_t.to_dense())

    indices[0][0] = 0
    after_index_write = self.ValueTensor([6, 0, 0])
    self.assertEqual(after_index_write, sparse_t.to_dense())
def test_to_dense_hybrid(self):
    """``to_dense()`` of a tensor with two sparse dimensions and one dense dimension of size 2."""
    indices = self.IndexTensor([
        [0, 1, 2, 2],
        [0, 0, 0, 3],
    ])
    values = self.ValueTensor([[2, 3], [1, 2], [3, 4], [4, 5]])
    sparse_t = self.SparseTensor(indices, values, torch.Size([3, 4, 2]))
    expected = self.ValueTensor([
        [[2, 3],
         [0, 0],
         [0, 0],
         [0, 0]],
        [[1, 2],
         [0, 0],
         [0, 0],
         [0, 0]],
        [[3, 4],
         [0, 0],
         [0, 0],
         [4, 5]],
    ])

    # Tests double to_dense for memory corruption
    for _ in range(3):
        sparse_t.to_dense()
    self.assertEqual(expected, sparse_t.to_dense())
def test_contig(self):
    """Coalescing yields the expected indices and values for three hand-written cases.

    The expected results list the indices in sorted order; in the last case the
    values of the repeated index are summed.
    """
    cases = [
        # (indices, values, size, expected indices, expected values)
        ([[1, 0, 35, 14, 39, 6, 71, 66, 40, 27],
          [92, 31, 62, 50, 22, 65, 89, 74, 56, 34]],
         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
         [100, 100],
         [[0, 1, 6, 14, 27, 35, 39, 40, 66, 71],
          [31, 92, 65, 50, 34, 62, 22, 56, 74, 89]],
         [2, 1, 6, 4, 10, 3, 5, 9, 8, 7]),
        ([[2, 0, 2, 1],
          [0, 0, 3, 0],
          [1, 0, 4, 0]],
         [3, 2, 4, 1],
         [3, 4, 5],
         [[0, 1, 2, 2],
          [0, 0, 0, 3],
          [0, 0, 1, 4]],
         [2, 1, 3, 4]),
        # Duplicate indices
        ([[0, 0, 2, 0],
          [0, 0, 3, 0],
          [0, 0, 4, 0]],
         [3, 2, 4, 1],
         [3, 4, 5],
         [[0, 2],
          [0, 3],
          [0, 4]],
         [6, 4]),
    ]
    for raw_i, raw_v, size, want_i, want_v in cases:
        i = self.IndexTensor(raw_i)
        v = self.ValueTensor(raw_v)
        x = self.SparseTensor(i, v, torch.Size(size))
        exp_i = self.IndexTensor(want_i)
        exp_v = self.ValueTensor(want_v)
        x = self.safeCoalesce(x)
        self.assertEqual(exp_i, x._indices())
        self.assertEqual(exp_v, x._values())
def test_contig_hybrid(self):
    """Same as the coalescing cases above in spirit, but with a trailing dense value dimension.

    The expected results list the indices in sorted order; in the last case the
    value rows of the repeated index are summed.
    """
    cases = [
        # (indices, values, size, expected indices, expected values)
        ([[1, 0, 35, 14, 39, 6, 71, 66, 40, 27],
          [92, 31, 62, 50, 22, 65, 89, 74, 56, 34]],
         [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
          [6, 7], [7, 8], [8, 9], [9, 10], [10, 11]],
         [100, 100, 2],
         [[0, 1, 6, 14, 27, 35, 39, 40, 66, 71],
          [31, 92, 65, 50, 34, 62, 22, 56, 74, 89]],
         [[2, 3], [1, 2], [6, 7], [4, 5], [10, 11],
          [3, 4], [5, 6], [9, 10], [8, 9], [7, 8]]),
        ([[2, 0, 2, 1],
          [0, 0, 3, 0],
          [1, 0, 4, 0]],
         [[3, 3, 3], [2, 2, 2], [4, 4, 4], [1, 1, 1]],
         [3, 4, 5, 3],
         [[0, 1, 2, 2],
          [0, 0, 0, 3],
          [0, 0, 1, 4]],
         [[2, 2, 2], [1, 1, 1], [3, 3, 3], [4, 4, 4]]),
        # Duplicate indices
        ([[0, 0, 2, 0],
          [0, 0, 3, 0],
          [0, 0, 4, 0]],
         [[3, 2, 3], [2, 1, 1], [4, 3, 4], [1, 1, 1]],
         [3, 4, 5, 3],
         [[0, 2],
          [0, 3],
          [0, 4]],
         [[6, 4, 5], [4, 3, 4]]),
    ]
    for raw_i, raw_v, size, want_i, want_v in cases:
        i = self.IndexTensor(raw_i)
        v = self.ValueTensor(raw_v)
        x = self.SparseTensor(i, v, torch.Size(size))
        exp_i = self.IndexTensor(want_i)
        exp_v = self.ValueTensor(want_v)
        x = self.safeCoalesce(x)
        self.assertEqual(exp_i, x._indices())
        self.assertEqual(exp_v, x._values())
def test_clone(self):
    """``clone()`` keeps the coalesced / uncoalesced state of the source tensor."""
    source = self._gen_sparse(4, 20, 5)[0]
    if self.is_uncoalesced:
        self.assertFalse(source.is_coalesced())
        self.assertFalse(source.clone().is_coalesced())

    source = source.coalesce()
    self.assertTrue(source.is_coalesced())
    self.assertTrue(source.clone().is_coalesced())
def test_transpose(self):
    """Sparse ``transpose_`` / ``transpose`` agree with dense ``transpose`` for every pair of dims."""
    sparse_t = self._gen_sparse(4, 20, 5)[0]
    dense_t = sparse_t.to_dense()

    for first, second in itertools.combinations(range(4), 2):
        # in-place variant on the sparse side
        sparse_t = sparse_t.transpose_(first, second)
        dense_t = dense_t.transpose(first, second)
        self.assertEqual(sparse_t.to_dense(), dense_t)

        # out-of-place variant on the sparse side
        sparse_t = sparse_t.transpose(first, second)
        dense_t = dense_t.transpose(first, second)
        self.assertEqual(sparse_t.to_dense(), dense_t)
@cpu_only
def test_mm(self):
    """``addmm`` / ``mm`` with a sparse first matrix match the dense computation."""
    def check(di, dj, dk):
        x, _, _ = self._gen_sparse(2, 20, [di, dj])
        t = torch.randn(di, dk)
        y = torch.randn(dj, dk)
        alpha = random.random()
        beta = random.random()

        self.assertEqual(torch.addmm(alpha, t, beta, x, y),
                         torch.addmm(alpha, t, beta, x.to_dense(), y))
        self.assertEqual(torch.addmm(t, x, y),
                         torch.addmm(t, x.to_dense(), y))
        self.assertEqual(torch.mm(x, y),
                         torch.mm(x.to_dense(), y))

    for dims in ((10, 100, 100), (100, 1000, 200), (64, 10000, 300)):
        check(*dims)
@cpu_only
def test_saddmm(self):
    """``saddmm`` / ``smm`` results, converted to dense, match the dense ``addmm`` / ``mm``."""
    def check(di, dj, dk):
        x = self._gen_sparse(2, 20, [di, dj])[0]
        t = self._gen_sparse(2, 20, [di, dk])[0]
        y = torch.randn(dj, dk)
        alpha = random.random()
        beta = random.random()

        actual = torch.saddmm(alpha, t, beta, x, y)
        self.assertEqual(actual.to_dense(),
                         torch.addmm(alpha, t.to_dense(), beta, x.to_dense(), y))

        actual = torch.saddmm(t, x, y)
        self.assertEqual(actual.to_dense(),
                         torch.addmm(t.to_dense(), x.to_dense(), y))

        actual = torch.smm(x, y)
        self.assertEqual(actual.to_dense(), torch.mm(x.to_dense(), y))

    for dims in ((7, 5, 3), (1000, 100, 100), (3000, 64, 300)):
        check(*dims)
def test_dsmm(self):
    """Check ``torch.dsmm`` against ``torch.mm`` on the densified operand."""

    def check_dims(di, dj, dk):
        sparse, _, _ = self._gen_sparse(2, 20, [di, dj])
        rhs = self.randn(dj, dk)

        actual = torch.dsmm(sparse, rhs)
        reference = torch.mm(sparse.to_dense(), rhs)
        # The dsmm result is compared directly, without to_dense().
        self.assertEqual(actual, reference)

    for dims in ((7, 5, 3), (1000, 100, 100), (3000, 64, 300)):
        check_dims(*dims)
def test_hsmm(self):
    """Check ``torch.hsmm`` against ``torch.mm`` on the densified operand."""

    def check_dims(di, dj, dk):
        sparse, _, _ = self._gen_sparse(2, 20, [di, dj])
        rhs = self.randn(dj, dk)

        actual = torch.hsmm(sparse, rhs)
        reference = torch.mm(sparse.to_dense(), rhs)
        # The hsmm result is densified before comparing.
        self.assertEqual(actual.to_dense(), reference)

    for dims in ((7, 5, 3), (1000, 100, 100), (3000, 64, 300)):
        check_dims(*dims)
def _test_spadd_shape(self, shape_i, shape_v=None):
    """Check ``torch.add(dense, scalar, sparse)`` for one tensor shape.

    ``shape_i`` and the optional ``shape_v`` are concatenated into the full
    shape; ``len(shape_i)`` is forwarded to ``_gen_sparse`` as its first
    argument (presumably the number of sparse dims -- confirm against
    ``_gen_sparse``).
    """
    full_shape = shape_i + (shape_v or [])
    sparse, _, _ = self._gen_sparse(len(shape_i), 10, full_shape)

    def check(dense):
        scale = random.random()
        actual = torch.add(dense, scale, sparse)
        reference = dense + scale * sparse.to_dense()
        self.assertEqual(actual, reference)

    check(self.randn(*full_shape))

    # Non contiguous dense tensor: allocate with the first and last sizes
    # swapped, then transpose those two dims back in place.
    swapped = list(full_shape)
    swapped[0], swapped[-1] = full_shape[-1], full_shape[0]
    strided = self.randn(*swapped)
    strided.transpose_(0, len(swapped) - 1)
    check(strided)
def test_spadd(self):
    """Run ``_test_spadd_shape`` over several shapes without ``shape_v``."""
    for dims in ([5, 6], [10, 10, 10], [50, 30, 20], [5, 5, 5, 5, 5, 5]):
        self._test_spadd_shape(dims)
def test_spadd_hybrid(self):
    """Run ``_test_spadd_shape`` over shapes that also pass a ``shape_v``."""
    cases = (
        ([5, 6], [2, 3]),
        ([10, 10, 10], [3]),
        ([50, 30, 20], [2]),
        ([5, 5, 5, 5, 5, 5], [2]),
    )
    for shape_i, shape_v in cases:
        self._test_spadd_shape(shape_i, shape_v)
def _test_basic_ops_shape(self, shape_i, shape_v=None):
    """Check basic arithmetic, ``zero_`` and ``coalesce`` for one shape.

    Each arithmetic op is run out of place and on a clone, and both results
    are compared (after ``to_dense()``) with the same op on dense tensors.
    ``shape_i`` and the optional ``shape_v`` are concatenated into the full
    shape passed to ``_gen_sparse``.
    """
    full_shape = shape_i + (shape_v or [])
    lhs, _, _ = self._gen_sparse(len(shape_i), 9, full_shape)
    rhs, _, _ = self._gen_sparse(len(shape_i), 12, full_shape)

    def check_pair(out_of_place, via_clone, reference):
        self.assertEqual(out_of_place.to_dense(), reference)
        self.assertEqual(via_clone.to_dense(), reference)

    # sparse + sparse
    result = lhs + rhs
    copy = lhs.clone()
    copy.add_(rhs)
    check_pair(result, copy, lhs.to_dense() + rhs.to_dense())

    # sparse - sparse
    result = lhs - rhs
    copy = lhs.clone()
    copy.sub_(rhs)
    check_pair(result, copy, lhs.to_dense() - rhs.to_dense())

    # sparse * sparse
    result = lhs * rhs
    copy = lhs.clone()
    copy.mul_(rhs)
    check_pair(result, copy, lhs.to_dense() * rhs.to_dense())

    # sparse * scalar
    result = lhs * 37.5
    copy = lhs.clone()
    copy.mul_(37.5)
    check_pair(result, copy, lhs.to_dense() * 37.5)

    # sparse / scalar
    result = lhs / 37.5
    copy = lhs.clone()
    copy.div_(37.5)
    check_pair(result, copy, lhs.to_dense() / 37.5)

    # TODO: add back inplace support
    result = lhs ** 2
    copy = lhs.clone().pow(2)
    check_pair(result, copy, lhs.to_dense() ** 2)

    # zero_ on a clone must densify to all zeros.
    cleared = lhs.clone()
    cleared.zero_()
    self.assertEqual(cleared.to_dense(), torch.zeros(lhs.size()))

    # coalesce() must leave the source tensor's coalesced flag untouched.
    self.assertFalse(lhs.is_coalesced())
    first = lhs.coalesce()
    second = lhs.coalesce()
    self.assertFalse(lhs.is_coalesced())
    self.assertTrue(first.is_coalesced())
    self.assertEqual(lhs, first)
    # check that coalesce is out of place
    first._values().add_(1)
    self.assertEqual(second._values() + 1, first._values())
def test_basic_ops(self):
    """Run ``_test_basic_ops_shape`` over several shapes without ``shape_v``."""
    for dims in ([5, 6], [10, 10, 10], [50, 30, 20], [5, 5, 5, 5, 5, 5]):
        self._test_basic_ops_shape(dims)
def test_basic_ops_hybrid(self):
    """Run ``_test_basic_ops_shape`` over shapes that also pass a ``shape_v``."""
    cases = (
        ([5, 6], [2, 3]),
        ([10, 10, 10], [3]),
        ([50, 30, 20], [2]),
        ([5, 5, 5, 5, 5, 5], [2]),
    )
    for shape_i, shape_v in cases:
        self._test_basic_ops_shape(shape_i, shape_v)
def _test_sparse_mask_shape(self, shape_i, shape_v=None):
    """Check ``_sparse_mask`` on a random dense tensor of the given shape.

    ``shape_i`` and the optional ``shape_v`` are concatenated into the full
    shape; ``len(shape_i)`` is forwarded to ``_gen_sparse`` as its first
    argument.
    """
    shape = shape_i + (shape_v or [])
    x1, _, _ = self._gen_sparse(len(shape_i), 9, shape)
    x2, _, _ = self._gen_sparse(len(shape_i), 12, shape)

    # Kept from the previous version of this helper: sparse addition check.
    y1 = x1 + x2
    y2 = x1.clone()
    y2.add_(x2)
    expected = x1.to_dense() + x2.to_dense()
    self.assertEqual(y1.to_dense(), expected)
    self.assertEqual(y2.to_dense(), expected)

    # This helper previously never called _sparse_mask. Mask a random dense
    # tensor with x1's index pattern and compare with an explicit 0/1 mask.
    # The mask is coalesced first, as the fixed-value mask tests do.
    mask = x1.coalesce()
    dense = self.randn(*shape)
    res = dense._sparse_mask(mask)
    ones = self.SparseTensor(
        mask._indices(), mask._values().clone().fill_(1), mask.size())
    self.assertEqual(res.to_dense(), dense * ones.to_dense())
def _test_sparse_mask_fixed(self):
    """Check ``_sparse_mask`` on a hand-written 5x4 dense tensor."""
    indices = self.IndexTensor([
        [1, 3, 0, 4],
        [2, 1, 2, 3],
    ])
    values = self.ValueTensor([1, 2, 3, 4])
    mask = self.SparseTensor(indices, values, torch.Size([5, 4])).coalesce()
    dense = self.ValueTensor([
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
        [17, 18, 19, 20],
    ])
    # Entries of `dense` at (1, 2), (3, 1), (0, 2) and (4, 3).
    picked = self.ValueTensor([7, 14, 3, 20])
    actual = dense._sparse_mask(mask)
    reference = self.SparseTensor(indices, picked, torch.Size([5, 4]))
    self.assertEqual(actual, reference)
def test_sparse_mask(self):
    """Run the fixed mask check, then the shape helper without ``shape_v``."""
    self._test_sparse_mask_fixed()
    for dims in ([5, 6], [10, 10, 10], [50, 30, 20], [5, 5, 5, 5, 5, 5]):
        self._test_sparse_mask_shape(dims)
def _test_sparse_mask_hybrid_fixed(self):
    """Check ``_sparse_mask`` on a hand-written 5x4x2 dense tensor."""
    indices = self.IndexTensor([
        [1, 3, 0, 4],
        [2, 1, 2, 3],
    ])
    values = self.ValueTensor([[1, 2], [2, 3], [3, 4], [4, 5]])
    # TODO: This is also testing that, if coalesce is a no-op,
    # the indices don't get permuted. I don't know if we actually
    # want to give this invariant.
    mask = self.SparseTensor(
        indices, values, torch.Size([5, 4, 2])).coalesce()
    dense = self.ValueTensor([
        [[1, 3], [2, 2], [3, 3], [4, 2]],
        [[5, 7], [6, 7], [7, 9], [8, 9]],
        [[9, 2], [10, 4], [11, 1], [12, 3]],
        [[13, 5], [14, 1], [15, 1], [16, 6]],
        [[17, 7], [18, 2], [19, 7], [20, 1]],
    ])
    actual = dense._sparse_mask(mask)
    # Length-2 rows of `dense` at (1, 2), (3, 1), (0, 2) and (4, 3).
    picked = self.ValueTensor([[7, 9], [14, 1], [3, 3], [20, 1]])
    reference = self.SparseTensor(indices, picked, torch.Size([5, 4, 2]))
    self.assertEqual(actual, reference)
def test_sparse_mask_hybrid(self):
    """Run the fixed hybrid mask check, then the shape helper with ``shape_v``."""
    self._test_sparse_mask_hybrid_fixed()
    cases = (
        ([5, 6], [2, 3]),
        ([10, 10, 10], [3]),
        ([50, 30, 20], [2]),
        ([5, 5, 5, 5, 5, 5], [2]),
    )
    for shape_i, shape_v in cases:
        self._test_sparse_mask_shape(shape_i, shape_v)
@cuda_only
def test_storage_not_null(self):
    """A CUDA sparse tensor built from just ``2`` must not report device -1."""
    tensor = torch.cuda.sparse.FloatTensor(2)
    device = tensor.get_device()
    self.assertNotEqual(device, -1)
@cuda_only
@unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
def test_same_gpu(self):
    """Check device placement of sparse tensors created on GPU 1."""

    def assert_on_gpu_one(sparse):
        # The tensor, its values and its indices must all report device 1.
        self.assertEqual(sparse.get_device(), 1)
        self.assertEqual(sparse._values().get_device(), 1)
        self.assertEqual(sparse._indices().get_device(), 1)

    i = self.IndexTensor([[2]]).cuda(1)
    v = self.ValueTensor([5]).cuda(1)
    assert_on_gpu_one(self.SparseTensor(i, v, torch.Size([3]), device=1))

    # Built from a bare 3 with an explicit device.
    assert_on_gpu_one(self.SparseTensor(3, device=1))

    # Indices on GPU 1 with values on GPU 0 must be rejected.
    v = self.ValueTensor([5]).cuda(0)
    with self.assertRaises(RuntimeError):
        self.SparseTensor(i, v, torch.Size([3]))
class TestUncoalescedSparse(TestSparse):
    """``TestSparse`` variant whose ``setUp`` turns ``is_uncoalesced`` on."""

    def setUp(self):
        # Run the parent set-up first, then override the flag it leaves behind.
        super(TestUncoalescedSparse, self).setUp()
        self.is_uncoalesced = True
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
class TestCudaSparse(TestSparse):
    """``TestSparse`` variant that swaps in the CUDA tensor constructors."""

    def setUp(self):
        super(TestCudaSparse, self).setUp()
        self.is_cuda = True
        # Constructors the inherited tests use through ``self``.
        cuda = torch.cuda
        self.IndexTensor = cuda.LongTensor
        self.ValueTensor = cuda.DoubleTensor
        self.SparseTensor = cuda.sparse.DoubleTensor
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
class TestCudaUncoalescedSparse(TestCudaSparse):
    """``TestCudaSparse`` variant whose ``setUp`` turns ``is_uncoalesced`` on."""

    def setUp(self):
        # Run the CUDA set-up first, then override the flag it leaves behind.
        super(TestCudaUncoalescedSparse, self).setUp()
        self.is_uncoalesced = True
# Script entry point: call run_tests() only when executed directly.
if __name__ == '__main__':
    run_tests()

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More