1 /*
2  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
3  * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <linux/slab.h>
35 #include <linux/module.h>
36 #include <linux/sched.h>
37 
38 #ifdef __linux__
39 #include <linux/proc_fs.h>
40 #include <linux/cred.h>
41 #endif
42 
43 #include "mlx4_ib.h"
44 
convert_access(int acc)45 static u32 convert_access(int acc)
46 {
47 	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
48 	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
49 	       (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
50 	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
51 	       MLX4_PERM_LOCAL_READ;
52 }
53 #ifdef __linux__
shared_mr_proc_read(struct file * file,char __user * buffer,size_t len,loff_t * offset)54 static ssize_t shared_mr_proc_read(struct file *file,
55 			  char __user *buffer,
56 			  size_t len,
57 			  loff_t *offset)
58 {
59 
60 	return -ENOSYS;
61 
62 }
63 
shared_mr_proc_write(struct file * file,const char __user * buffer,size_t len,loff_t * offset)64 static ssize_t shared_mr_proc_write(struct file *file,
65 			   const char __user *buffer,
66 			   size_t len,
67 			   loff_t *offset)
68 {
69 
70 	return -ENOSYS;
71 }
72 
shared_mr_mmap(struct file * filep,struct vm_area_struct * vma)73 static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma)
74 {
75 
76 	struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode);
77 	struct mlx4_shared_mr_info *smr_info =
78 		(struct mlx4_shared_mr_info *)pde->data;
79 
80 	/* Prevent any mapping not on start of area */
81 	if (vma->vm_pgoff != 0)
82 		return -EINVAL;
83 
84 	return ib_umem_map_to_vma(smr_info->umem,
85 					vma);
86 
87 }
88 
89 static const struct file_operations shared_mr_proc_ops = {
90 	.owner	= THIS_MODULE,
91 	.read	= shared_mr_proc_read,
92 	.write	= shared_mr_proc_write,
93 	.mmap	= shared_mr_mmap
94 };
95 
convert_shared_access(int acc)96 static mode_t convert_shared_access(int acc)
97 {
98 
99 	return (acc & IB_ACCESS_SHARED_MR_USER_READ ? S_IRUSR       : 0) |
100 	       (acc & IB_ACCESS_SHARED_MR_USER_WRITE  ? S_IWUSR : 0) |
101 	       (acc & IB_ACCESS_SHARED_MR_GROUP_READ   ? S_IRGRP  : 0) |
102 	       (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE   ? S_IWGRP  : 0) |
103 	       (acc & IB_ACCESS_SHARED_MR_OTHER_READ   ? S_IROTH  : 0) |
104 	       (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE   ? S_IWOTH  : 0);
105 
106 }
107 #endif
mlx4_ib_get_dma_mr(struct ib_pd * pd,int acc)108 struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
109 {
110 	struct mlx4_ib_mr *mr;
111 	int err;
112 
113 	mr = kzalloc(sizeof *mr, GFP_KERNEL);
114 	if (!mr)
115 		return ERR_PTR(-ENOMEM);
116 
117 	err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
118 			    ~0ull, convert_access(acc), 0, 0, &mr->mmr);
119 	if (err)
120 		goto err_free;
121 
122 	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
123 	if (err)
124 		goto err_mr;
125 
126 	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
127 	mr->umem = NULL;
128 
129 	return &mr->ibmr;
130 
131 err_mr:
132 	mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
133 
134 err_free:
135 	kfree(mr);
136 
137 	return ERR_PTR(err);
138 }
139 
/*
 * Emit the MTT entries that cover one physically contiguous block.
 *
 * @dev:            device whose MTT is written
 * @mtt:            MTT being populated
 * @mtt_size:       size in bytes covered by one MTT entry (1 << @mtt_shift)
 * @mtt_shift:      log2 of @mtt_size
 * @len:            length in bytes of the contiguous block
 * @cur_start_addr: DMA address at which the block starts
 * @pages:          staging buffer holding up to PAGE_SIZE / sizeof(u64)
 *                  entries
 * @start_index:    in/out, MTT index at which the next flush is written
 * @npages:         in/out, number of entries currently staged in @pages
 *
 * The block is widened so that it starts and ends on @mtt_size
 * boundaries, then one entry per @mtt_size is appended to @pages.
 * Whenever the staging buffer fills up it is flushed with
 * mlx4_write_mtt(). Entries left in @pages on return must be flushed
 * by the caller.
 *
 * Returns 0 on success, -EINVAL if the widened length is still not a
 * multiple of @mtt_size, or the error returned by mlx4_write_mtt().
 */
static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
						struct mlx4_mtt *mtt,
						u64 mtt_size,
						u64 mtt_shift,
						u64 len,
						u64 cur_start_addr,
						u64 *pages,
						int *start_index,
						int *npages)
{
	/*
	 * The counter is 64 bits wide because it is compared against and
	 * multiplied with u64 quantities; a plain int could overflow for
	 * very large blocks.
	 */
	u64 k;
	int err;
	u64 mtt_entries;
	u64 cur_end_addr = cur_start_addr + len;
	u64 cur_end_addr_aligned;

	/* Grow the length by the misalignment at both ends of the block */
	len += (cur_start_addr & (mtt_size - 1ULL));
	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
	len += (cur_end_addr_aligned - cur_end_addr);
	if (len & (mtt_size - 1ULL)) {
		WARN(1 ,
		"write_block: len %llx is not aligned to mtt_size %llx\n",
			(unsigned long long)len,
			(unsigned long long)mtt_size);
		return -EINVAL;
	}

	mtt_entries = (len >> mtt_shift);

	/*
	 * Align the MTT start address to mtt_size. This handles an MR
	 * that starts in the middle of an MTT record. Older code did not
	 * need it because the physical addresses provided by the DMA
	 * subsystem were page aligned, and the page size was also the
	 * MTT size.
	 */
	cur_start_addr = round_down(cur_start_addr, mtt_size);

	/* A new block is started ... */
	for (k = 0; k < mtt_entries; ++k) {
		pages[*npages] = cur_start_addr + (mtt_size * k);
		(*npages)++;
		/*
		 * Be friendly to mlx4_write_mtt() and
		 * pass it chunks of appropriate size.
		 */
		if (*npages == PAGE_SIZE / sizeof(u64)) {
			err = mlx4_write_mtt(dev->dev,
					mtt, *start_index,
					*npages, pages);
			if (err)
				return err;

			(*start_index) += *npages;
			*npages = 0;
		}
	}

	return 0;
}
201 
mlx4_ib_umem_write_mtt(struct mlx4_ib_dev * dev,struct mlx4_mtt * mtt,struct ib_umem * umem)202 int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
203 			   struct ib_umem *umem)
204 {
205 	u64 *pages;
206 	struct ib_umem_chunk *chunk;
207 	int j;
208 	u64 len = 0;
209 	int err = 0;
210 	u64 mtt_size;
211 	u64 cur_start_addr = 0;
212 	u64 mtt_shift;
213 	int start_index = 0;
214 	int npages = 0;
215 
216 	pages = (u64 *) __get_free_page(GFP_KERNEL);
217 	if (!pages)
218 		return -ENOMEM;
219 
220 	mtt_shift = mtt->page_shift;
221 	mtt_size = 1ULL << mtt_shift;
222 
223 	list_for_each_entry(chunk, &umem->chunk_list, list)
224 		for (j = 0; j < chunk->nmap; ++j) {
225 			if (cur_start_addr + len ==
226 			    sg_dma_address(&chunk->page_list[j])) {
227 				/* still the same block */
228 				len += sg_dma_len(&chunk->page_list[j]);
229 				continue;
230 			}
231 			/* A new block is started ...*/
232 			/* If len is malaligned, write an extra mtt entry to
233 			    cover the misaligned area (round up the division)
234 			*/
235 			err = mlx4_ib_umem_write_mtt_block(dev,
236 						mtt, mtt_size, mtt_shift,
237 						len, cur_start_addr,
238 						pages,
239 						&start_index,
240 						&npages);
241 			if (err)
242 				goto out;
243 
244 			cur_start_addr =
245 				sg_dma_address(&chunk->page_list[j]);
246 			len = sg_dma_len(&chunk->page_list[j]);
247 		}
248 
249 	/* Handle the last block */
250 	if (len > 0) {
251 		/*  If len is malaligned, write an extra mtt entry to cover
252 		     the misaligned area (round up the division)
253 		*/
254 		err = mlx4_ib_umem_write_mtt_block(dev,
255 						mtt, mtt_size, mtt_shift,
256 						len, cur_start_addr,
257 						pages,
258 						&start_index,
259 						&npages);
260 			if (err)
261 				goto out;
262 	}
263 
264 
265 	if (npages)
266 		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
267 
268 out:
269 	free_page((unsigned long) pages);
270 	return err;
271 }
272 
/*
 * Return the bit position of the lowest set bit of @ptr, i.e. the log2
 * of the largest power of two that divides it.
 */
static inline u64 alignment_of(u64 ptr)
{
	/* Isolate the lowest set bit; -ptr equals ~(ptr - 1) for unsigned */
	u64 lowest_bit = ptr & -ptr;

	return ilog2(lowest_bit);
}
277 
/*
 * Reduce @block_shift so that both the start of the next block and the
 * end of the current block are aligned to 1 << block_shift.
 *
 * Returns the (possibly reduced) shift.
 */
static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
						u64 current_block_end,
						u64 block_shift)
{
	u64 low_mask;

	/*
	 * The new block must start with zeros in all bits below the
	 * entity size. If it is less aligned than the previous block,
	 * shrink the shift to the position of its lowest set bit.
	 */
	low_mask = (1ULL << block_shift) - 1ULL;
	if (next_block_start & low_mask)
		block_shift = alignment_of(next_block_start);

	/*
	 * Apply the same test to the end of the previous block, using
	 * the shift selected so far.
	 */
	low_mask = (1ULL << block_shift) - 1ULL;
	if (current_block_end & low_mask)
		block_shift = alignment_of(current_block_end);

	return block_shift;
}
308 
309 /* Calculate optimal mtt size based on contiguous pages.
310 * Function will return also the number of pages that are not aligned to the
311    calculated mtt_size to be added to total number
312     of pages. For that we should check the first chunk length & last chunk
313     length and if not aligned to mtt_size we should increment
314     the non_aligned_pages number.
315     All chunks in the middle already handled as part of mtt shift calculation
316     for both their start & end addresses.
317 */
mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem * umem,u64 start_va,int * num_of_mtts)318 int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
319 						u64 start_va,
320 						int *num_of_mtts)
321 {
322 	struct ib_umem_chunk *chunk;
323 	int j;
324 	u64 block_shift = MLX4_MAX_MTT_SHIFT;
325 	u64 current_block_len = 0;
326 	u64 current_block_start = 0;
327 	u64 misalignment_bits;
328 	u64 first_block_start = 0;
329 	u64 last_block_end = 0;
330 	u64 total_len = 0;
331 	u64 last_block_aligned_end = 0;
332 	u64 min_shift = ilog2(umem->page_size);
333 
334 	list_for_each_entry(chunk, &umem->chunk_list, list) {
335 		/* Initialization - save the first chunk start as
336 		    the current_block_start - block means contiguous pages.
337 		*/
338 		if (current_block_len == 0 && current_block_start == 0) {
339 			first_block_start = current_block_start =
340 				sg_dma_address(&chunk->page_list[0]);
341 			/* Find the bits that are different between
342 			    the physical address and the virtual
343 			    address for the start of the MR.
344 			*/
345 			/* umem_get aligned the start_va to a page
346 			   boundry. Therefore, we need to align the
347 			   start va to the same boundry */
348 			/* misalignment_bits is needed to handle the
349 			   case of a single memory region. In this
350 			   case, the rest of the logic will not reduce
351 			   the block size.  If we use a block size
352 			   which is bigger than the alignment of the
353 			   misalignment bits, we might use the virtual
354 			   page number instead of the physical page
355 			   number, resulting in access to the wrong
356 			   data. */
357 			misalignment_bits =
358 			(start_va & (~(((u64)(umem->page_size))-1ULL)))
359 						^ current_block_start;
360 			block_shift = min(alignment_of(misalignment_bits)
361 				, block_shift);
362 		}
363 
364 		/* Go over the scatter entries in the current chunk, check
365 		     if they continue the previous scatter entry.
366 		*/
367 		for (j = 0; j < chunk->nmap; ++j) {
368 			u64 next_block_start =
369 				sg_dma_address(&chunk->page_list[j]);
370 			u64 current_block_end = current_block_start
371 				+ current_block_len;
372 			/* If we have a split (non-contig.) between two block*/
373 			if (current_block_end != next_block_start) {
374 				block_shift = mlx4_ib_umem_calc_block_mtt(
375 						next_block_start,
376 						current_block_end,
377 						block_shift);
378 
379 				/* If we reached the minimum shift for 4k
380 				     page we stop the loop.
381 				*/
382 				if (block_shift <= min_shift)
383 					goto end;
384 
385 				/* If not saved yet we are in first block -
386 				     we save the length of first block to
387 				     calculate the non_aligned_pages number at
388 				*    the end.
389 				*/
390 				total_len += current_block_len;
391 
392 				/* Start a new block */
393 				current_block_start = next_block_start;
394 				current_block_len =
395 					sg_dma_len(&chunk->page_list[j]);
396 				continue;
397 			}
398 			/* The scatter entry is another part of
399 			     the current block, increase the block size
400 			* An entry in the scatter can be larger than
401 			4k (page) as of dma mapping
402 			which merge some blocks together.
403 			*/
404 			current_block_len +=
405 				sg_dma_len(&chunk->page_list[j]);
406 		}
407 	}
408 
409 	/* Account for the last block in the total len */
410 	total_len += current_block_len;
411 	/* Add to the first block the misalignment that it suffers from.*/
412 	total_len += (first_block_start & ((1ULL<<block_shift)-1ULL));
413 	last_block_end = current_block_start+current_block_len;
414 	last_block_aligned_end = round_up(last_block_end, 1<<block_shift);
415 	total_len += (last_block_aligned_end - last_block_end);
416 
417 	WARN((total_len & ((1ULL<<block_shift)-1ULL)),
418 		" misaligned total length detected (%llu, %llu)!",
419 		(long long)total_len, (long long)block_shift);
420 
421 	*num_of_mtts = total_len >> block_shift;
422 end:
423 	if (block_shift < min_shift) {
424 		/* If shift is less than the min we set a WARN and
425 		     return the min shift.
426 		*/
427 		WARN(1,
428 		"mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
429 		(long long)block_shift);
430 
431 		block_shift = min_shift;
432 	}
433 	return block_shift;
434 }
435 
436 #ifdef __linux__
prepare_shared_mr(struct mlx4_ib_mr * mr,int access_flags,int mr_id)437 static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
438 {
439 	struct proc_dir_entry *mr_proc_entry;
440 	mode_t mode = S_IFREG;
441 	char name_buff[16];
442 
443 	mode |= convert_shared_access(access_flags);
444 	sprintf(name_buff, "%X", mr_id);
445 	mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
446 	mr->smr_info->mr_id = mr_id;
447 	mr->smr_info->umem = mr->umem;
448 
449 	mr_proc_entry = proc_create_data(name_buff, mode,
450 				mlx4_mrs_dir_entry,
451 				&shared_mr_proc_ops,
452 				mr->smr_info);
453 
454 	if (!mr_proc_entry) {
455 		pr_err("prepare_shared_mr failed via proc\n");
456 		kfree(mr->smr_info);
457 		return -ENODEV;
458 	}
459 
460 	current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid));
461 	mr_proc_entry->size = mr->umem->length;
462 	return 0;
463 
464 }
is_shared_mr(int access_flags)465 static int is_shared_mr(int access_flags)
466 {
467 	/* We should check whether IB_ACCESS_SHARED_MR_USER_READ or
468 	other shared bits were turned on.
469 	*/
470 	return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ |
471 				IB_ACCESS_SHARED_MR_USER_WRITE |
472 				IB_ACCESS_SHARED_MR_GROUP_READ |
473 				IB_ACCESS_SHARED_MR_GROUP_WRITE |
474 				IB_ACCESS_SHARED_MR_OTHER_READ |
475 				IB_ACCESS_SHARED_MR_OTHER_WRITE));
476 
477 }
478 #endif
479 
/*
 * Register a user memory region on protection domain @pd.
 *
 * Steps: pin the user memory with ib_umem_get(), pick the MTT entry
 * size and count, allocate the hardware MR, write its MTT entries and
 * enable it. On Linux, when @access_flags carries any shared-MR bit,
 * the MR is additionally published through a proc entry named after
 * @mr_id.
 *
 * Returns the embedded ib_mr on success or an ERR_PTR() with the error
 * of the step that failed; everything acquired up to that point is
 * released.
 */
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata,
				  int mr_id)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int mtt_shift;
	int mtt_count;
	int rc;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->umem = ib_umem_get(pd->uobject->context, start, length,
			       access_flags, 0);
	if (IS_ERR(mr->umem)) {
		rc = PTR_ERR(mr->umem);
		goto free_mr_mem;
	}

	/*
	 * Start from the plain page count; the optimal-size calculation
	 * may replace it with the entry count for a larger MTT size.
	 */
	mtt_count = ib_umem_page_count(mr->umem);
	mtt_shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start,
						       &mtt_count);

	rc = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
			   convert_access(access_flags), mtt_count,
			   mtt_shift, &mr->mmr);
	if (rc)
		goto release_umem;

	rc = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
	if (rc)
		goto free_hw_mr;

	rc = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (rc)
		goto free_hw_mr;

	mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.rkey = mr->mmr.key;
#ifdef __linux__
	/* Check whether MR should be shared */
	if (is_shared_mr(access_flags)) {
		/*
		 * Start address and length must be aligned to the page
		 * size so that only full pages are mapped, preventing
		 * leakage of data.
		 */
		if (mr->umem->offset || (length & ~PAGE_MASK)) {
			rc = -EINVAL;
			goto free_hw_mr;
		}

		rc = prepare_shared_mr(mr, access_flags, mr_id);
		if (rc)
			goto free_hw_mr;
	}
#endif
	return &mr->ibmr;

free_hw_mr:
	mlx4_mr_free(dev->dev, &mr->mmr);

release_umem:
	ib_umem_release(mr->umem);

free_mr_mem:
	kfree(mr);

	return ERR_PTR(rc);
}
547 
548 
mlx4_ib_dereg_mr(struct ib_mr * ibmr)549 int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
550 {
551 	struct mlx4_ib_mr *mr = to_mmr(ibmr);
552 
553 	mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
554 	if (mr->smr_info) {
555 		/* When master/parent shared mr is dereged there is
556 		no ability to share this mr any more - its mr_id will be
557 		returned to the kernel as part of ib_uverbs_dereg_mr
558 		and may be allocated again as part of other reg_mr.
559 		*/
560 		char name_buff[16];
561 
562 		sprintf(name_buff, "%X", mr->smr_info->mr_id);
563 		/* Remove proc entry is checking internally that no operation
564 		    was strated on that proc fs file and if in the middle
565 		    current process will wait till end of operation.
566 		    That's why no sync mechanism is needed when we release
567 		    below the shared umem.
568 		*/
569 #ifdef __linux__
570 		remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
571 		kfree(mr->smr_info);
572 #endif
573 	}
574 
575 	if (mr->umem)
576 		ib_umem_release(mr->umem);
577 
578 	kfree(mr);
579 
580 	return 0;
581 }
582 
mlx4_ib_alloc_fast_reg_mr(struct ib_pd * pd,int max_page_list_len)583 struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
584 					int max_page_list_len)
585 {
586 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
587 	struct mlx4_ib_mr *mr;
588 	int err;
589 
590 	mr = kzalloc(sizeof *mr, GFP_KERNEL);
591 	if (!mr)
592 		return ERR_PTR(-ENOMEM);
593 
594 	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
595 			    max_page_list_len, 0, &mr->mmr);
596 	if (err)
597 		goto err_free;
598 
599 	err = mlx4_mr_enable(dev->dev, &mr->mmr);
600 	if (err)
601 		goto err_mr;
602 
603 	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
604 	mr->umem = NULL;
605 
606 	return &mr->ibmr;
607 
608 err_mr:
609 	mlx4_mr_free(dev->dev, &mr->mmr);
610 
611 err_free:
612 	kfree(mr);
613 	return ERR_PTR(err);
614 }
615 
mlx4_ib_alloc_fast_reg_page_list(struct ib_device * ibdev,int page_list_len)616 struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
617 							       int page_list_len)
618 {
619 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
620 	struct mlx4_ib_fast_reg_page_list *mfrpl;
621 	int size = page_list_len * sizeof (u64);
622 
623 	if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
624 		return ERR_PTR(-EINVAL);
625 
626 	mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
627 	if (!mfrpl)
628 		return ERR_PTR(-ENOMEM);
629 
630 	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
631 	if (!mfrpl->ibfrpl.page_list)
632 		goto err_free;
633 
634 	mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
635 						     size, &mfrpl->map,
636 						     GFP_KERNEL);
637 	if (!mfrpl->mapped_page_list)
638 		goto err_free;
639 
640 	WARN_ON(mfrpl->map & 0x3f);
641 
642 	return &mfrpl->ibfrpl;
643 
644 err_free:
645 	kfree(mfrpl->ibfrpl.page_list);
646 	kfree(mfrpl);
647 	return ERR_PTR(-ENOMEM);
648 }
649 
mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list * page_list)650 void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
651 {
652 	struct mlx4_ib_dev *dev = to_mdev(page_list->device);
653 	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
654 	int size = page_list->max_page_list_len * sizeof (u64);
655 
656 	dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list,
657 			  mfrpl->map);
658 	kfree(mfrpl->ibfrpl.page_list);
659 	kfree(mfrpl);
660 }
661 
mlx4_ib_fmr_alloc(struct ib_pd * pd,int acc,struct ib_fmr_attr * fmr_attr)662 struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
663 				 struct ib_fmr_attr *fmr_attr)
664 {
665 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
666 	struct mlx4_ib_fmr *fmr;
667 	int err = -ENOMEM;
668 
669 	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
670 	if (!fmr)
671 		return ERR_PTR(-ENOMEM);
672 
673 	err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
674 			     fmr_attr->max_pages, fmr_attr->max_maps,
675 			     fmr_attr->page_shift, &fmr->mfmr);
676 	if (err)
677 		goto err_free;
678 
679 	err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
680 	if (err)
681 		goto err_mr;
682 
683 	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;
684 
685 	return &fmr->ibfmr;
686 
687 err_mr:
688 	mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
689 
690 err_free:
691 	kfree(fmr);
692 
693 	return ERR_PTR(err);
694 }
695 
/*
 * Map @npages physical pages from @page_list into the FMR at I/O
 * virtual address @iova.
 *
 * Thin wrapper around mlx4_map_phys_fmr(), which is also handed the
 * addresses of the FMR's lkey and rkey. Returns its result.
 */
int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
		      int npages, u64 iova)
{
	struct mlx4_ib_fmr *fmr = to_mfmr(ibfmr);
	struct mlx4_ib_dev *ibdev = to_mdev(fmr->ibfmr.device);
	int rc;

	rc = mlx4_map_phys_fmr(ibdev->dev, &fmr->mfmr, page_list, npages,
			       iova, &fmr->ibfmr.lkey, &fmr->ibfmr.rkey);
	return rc;
}
705 
mlx4_ib_unmap_fmr(struct list_head * fmr_list)706 int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
707 {
708 	struct ib_fmr *ibfmr;
709 	int err;
710 	struct mlx4_dev *mdev = NULL;
711 
712 	list_for_each_entry(ibfmr, fmr_list, list) {
713 		if (mdev && to_mdev(ibfmr->device)->dev != mdev)
714 			return -EINVAL;
715 		mdev = to_mdev(ibfmr->device)->dev;
716 	}
717 
718 	if (!mdev)
719 		return 0;
720 
721 	list_for_each_entry(ibfmr, fmr_list, list) {
722 		struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
723 
724 		mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
725 	}
726 
727 	/*
728 	 * Make sure all MPT status updates are visible before issuing
729 	 * SYNC_TPT firmware command.
730 	 */
731 	wmb();
732 
733 	err = mlx4_SYNC_TPT(mdev);
734 	if (err)
735 		pr_warn("SYNC_TPT error %d when "
736 		       "unmapping FMRs\n", err);
737 
738 	return 0;
739 }
740 
mlx4_ib_fmr_dealloc(struct ib_fmr * ibfmr)741 int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
742 {
743 	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
744 	struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
745 	int err;
746 
747 	err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
748 
749 	if (!err)
750 		kfree(ifmr);
751 
752 	return err;
753 }
754