path: root/fpcsrc/rtl/arm/arm.inc
{

    This file is part of the Free Pascal run time library.
    Copyright (c) 2003 by the Free Pascal development team.

    Processor dependent implementation for the system unit for
    ARM

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$asmmode gas}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_FPC_MOVE}
{$endif FPC_SYSTEM_HAS_MOVE}

{$ifdef FPC_SYSTEM_FPC_MOVE}
const
  cpu_has_edsp : boolean = false;
  in_edsp_test : boolean = false;
{$endif FPC_SYSTEM_FPC_MOVE}

{$if not(defined(wince)) and not(defined(gba)) and not(defined(nds)) and not(defined(FPUSOFT)) and not(defined(FPULIBGCC))}

{$define FPC_SYSTEM_HAS_SYSINITFPU}
{$if not defined(darwin) and not defined(FPUVFPV2) and not defined(FPUVFPV3)}
Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
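  { FPA FPSR bit assumptions (a sketch, per the FPA programmer's model):
    bits 16..20 are the trap enables IVO (invalid), DVZ (divide-by-zero),
    OFL (overflow), UFL (underflow) and INX (inexact); the and with
    $ffe0ffff clears all five, the orr with $00070000 re-enables only
    IVO, DVZ and OFL. }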
  asm
    rfs r0
    and r0,r0,#0xffe0ffff
    orr r0,r0,#0x00070000
    wfs r0
  end;
end;
{$else}
Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
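  { VFP FPSCR layout relied on below: bits 23..22 select the rounding mode
    (00 = round to nearest), bit 24 is flush-to-zero, bits 12..8 are the
    trap enables (IXE, UFE, OFE, DZE, IOE) and bits 4..0 the cumulative
    exception flags. }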
  asm
    fmrx r0,fpscr
    // set "round to nearest" mode
    and  r0,r0,#0xff3fffff
    // mask "exception happened" and overflow flags
    and  r0,r0,#0xffffff20
    // mask exception flags
    and  r0,r0,#0xffff40ff    
{$ifndef darwin}
    // Floating point exceptions cause kernel panics on iPhoneOS 2.2.1...

    // disable flush-to-zero mode (IEEE math compliant)
    and  r0,r0,#0xfeffffff
    // enable invalid operation, div-by-zero and overflow exceptions
    orr  r0,r0,#0x00000700
{$endif}
    fmxr fpscr,r0
  end;
end;
{$endif}
{$endif}

procedure fpc_cpuinit;
begin
  { don't let libraries influence the FPU cw set by the host program }
  if not IsLibrary then
    SysInitFPU;
end;

{$ifdef wince}
function _controlfp(new: DWORD; mask: DWORD): DWORD; cdecl; external 'coredll';

{$define FPC_SYSTEM_HAS_SYSRESETFPU}
Procedure SysResetFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  softfloat_exception_flags:=0;
end;

{$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  softfloat_exception_mask:=float_flag_underflow or float_flag_inexact or float_flag_denormal;
  { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
  { FPU precision 64 bit, rounding to nearest, affine infinity }
  _controlfp($000C0003, $030F031F);
end;
{$endif wince}

{****************************************************************************
                       stack frame related stuff
****************************************************************************}

{$IFNDEF INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler;nostackframe;
asm
  mov    r0,r11
end;
{$ENDIF not INTERNAL_BACKTRACE}

{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
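{ The two helpers below assume the APCS-style frame laid out by the compiler's
  g_proc_entry (see arm/cgcpu.pas), effectively
  "mov ip,sp; stmfd sp!,{fp,ip,lr,pc}; sub fp,ip,#4", which puts the saved lr
  (the caller's return address) at [fp,#-4] and the saved fp (the caller's
  frame pointer) at [fp,#-12]. }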
function get_caller_addr(framebp:pointer):pointer;assembler;
asm
  movs r0,r0
  beq .Lg_a_null
  ldr r0,[r0,#-4]
.Lg_a_null:
end;


{$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
function get_caller_frame(framebp:pointer):pointer;assembler;
asm
  movs r0,r0
  beq .Lgnf_null
  // see comments in arm/cgcpu.pas, g_proc_entry
  ldr r0,[r0,#-12]
.Lgnf_null:
end;


{$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : pointer;assembler;
asm
  mov    r0,sp
end;


{$ifndef FPC_SYSTEM_HAS_FILLCHAR}
{$define FPC_SYSTEM_HAS_FILLCHAR}
Procedure FillChar(var x;count:longint;value:byte);assembler;nostackframe;
asm
        // count <= 0 ?
        cmp     r1,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
        movle   pc,lr
{$else}
        bxle    lr
{$endif}
        mov     r3,r0

        orr     r2,r2,r2,lsl #8
        orr     r2,r2,r2,lsl #16
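        // r2 now holds the fill byte replicated into all four byte lanes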

        tst     r3, #3  // Aligned?
        bne     .LFillchar_do_align

.LFillchar_is_aligned:
        subs    r1,r1,#8
        bmi     .LFillchar_less_than_8bytes

        mov     ip,r2
.LFillchar_at_least_8bytes:
        // Do 16 bytes per loop
        // More unrolling is unnecessary, as we'll just stall on the write buffers
        stmia   r3!,{r2,ip}
        subs    r1,r1,#8
        stmplia r3!,{r2,ip}
        subpls  r1,r1,#8
        bpl     .LFillchar_at_least_8bytes

.LFillchar_less_than_8bytes:
        // Do the rest
        adds    r1, r1, #8

{$if defined(cpuarmv3) or defined(cpuarmv4)}
        moveq   pc,lr
{$else}
        bxeq    lr
{$endif}

        tst     r1, #4
        strne   r2,[r3],#4
        tst     r1, #2
        strneh  r2,[r3],#2
        tst     r1, #1
        strneb  r2,[r3],#1
{$if defined(cpuarmv3) or defined(cpuarmv4)}
        mov pc,lr
{$else}
        bx  lr
{$endif}

// Special case for unaligned start
// We make a maximum of 3 loops here
.LFillchar_do_align:
        strb r2,[r3],#1
        subs r1, r1, #1
{$if defined(cpuarmv3) or defined(cpuarmv4)}
        moveq pc,lr
{$else}
        bxeq  lr
{$endif}
        tst r3,#3
        bne .LFillchar_do_align
        b .LFillchar_is_aligned
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move_pld(const source;var dest;count:longint);assembler;nostackframe;
asm
  pld [r0]
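  // pld is a cache-prefetch hint introduced with ARMv5TE (EDSP), which is
  // why fpc_cpucodeinit only selects Move_pld when the EDSP test succeeds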
  // count <=0 ?
  cmp r2,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
  movle pc,lr
{$else}
  bxle  lr
{$endif}
  // overlap?
  cmp r1,r0
  bls .Lnooverlap
  add r3,r0,r2
  cmp r3,r1
  bls .Lnooverlap
  // overlap, copy backward
.Loverlapped:
  subs r2,r2,#1
  ldrb r3,[r0,r2]
  strb r3,[r1,r2]
  bne .Loverlapped
{$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
{$else}
  bx  lr
{$endif}
.Lnooverlap:
  // fewer than 8 bytes to copy?
  cmp r2,#8
  // yes, then forget about the optimizations
  // and do a bytewise copy
  blt .Lbyteloop

  // both aligned?
  orr r3,r0,r1
  tst r3,#3

  bne .Lbyteloop
(*
  // yes, then align
  // alignment to 4 byte boundaries is enough
  ldrb ip,[r0],#1
  sub r2,r2,#1
  strb ip,[r1],#1
  tst r3,#2
  bne .Ldifferentaligned
  ldrh ip,[r0],#2
  sub r2,r2,#2
  strh ip,[r1],#2

.Ldifferentaligned:
  // qword aligned?
  orrs r3,r0,r1
  tst r3,#7
  bne .Ldwordloop
*)
  pld [r0,#32]
.Ldwordloop:
  sub r2,r2,#4
  ldr r3,[r0],#4
  // preload
  pld [r0,#64]
  cmp r2,#4
  str r3,[r1],#4
  bcs .Ldwordloop
  cmp r2,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
  moveq pc,lr
{$else}
  bxeq  lr
{$endif}
.Lbyteloop:
  subs r2,r2,#1
  ldrb r3,[r0],#1
  strb r3,[r1],#1
  bne .Lbyteloop
{$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
{$else}
  bx  lr
{$endif}
end;

procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
asm
  // count <=0 ?
  cmp r2,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
  movle pc,lr
{$else}
  bxle  lr
{$endif}
  // overlap?
  cmp r1,r0
  bls .Lnooverlap
  add r3,r0,r2
  cmp r3,r1
  bls .Lnooverlap
  // overlap, copy backward
.Loverlapped:
  subs r2,r2,#1
  ldrb r3,[r0,r2]
  strb r3,[r1,r2]
  bne .Loverlapped
{$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
{$else}
  bx  lr
{$endif}
.Lnooverlap:
  // fewer than 8 bytes to copy?
  cmp r2,#8
  // yes, then forget about the optimizations
  // and do a bytewise copy
  blt .Lbyteloop

  // both aligned?
  orr r3,r0,r1
  tst r3,#3

  bne .Lbyteloop
(*
  // yes, then align
  // alignment to 4 byte boundaries is enough
  ldrb ip,[r0],#1
  sub r2,r2,#1
  strb ip,[r1],#1
  tst r3,#2
  bne .Ldifferentaligned
  ldrh ip,[r0],#2
  sub r2,r2,#2
  strh ip,[r1],#2

.Ldifferentaligned:
  // qword aligned?
  orrs r3,r0,r1
  tst r3,#7
  bne .Ldwordloop
*)
.Ldwordloop:
  sub r2,r2,#4
  ldr r3,[r0],#4
  cmp r2,#4
  str r3,[r1],#4
  bcs .Ldwordloop
  cmp r2,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
  moveq pc,lr
{$else}
  bxeq  lr
{$endif}
.Lbyteloop:
  subs r2,r2,#1
  ldrb r3,[r0],#1
  strb r3,[r1],#1
  bne .Lbyteloop
{$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
{$else}
  bx  lr
{$endif}
end;


const
  moveproc : pointer = @move_blended;

procedure Move(const source;var dest;count:longint);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
asm
  ldr ip,.Lmoveproc
  ldr pc,[ip]
.Lmoveproc:
  .long moveproc
end;
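{ FPC_MOVE tail-jumps through the moveproc pointer, so the dispatch costs one
  extra load per call; fpc_cpucodeinit repoints moveproc at Move_pld once the
  EDSP check has shown that pld is available. }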

{$endif FPC_SYSTEM_HAS_MOVE}

{****************************************************************************
                                 String
****************************************************************************}

{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}

{$ifndef FPC_STRTOSHORTSTRINGPROC}
function fpc_shortstr_to_shortstr(len:longint;const sstr:shortstring):shortstring;assembler;nostackframe;[public,alias: 'FPC_SHORTSTR_TO_SHORTSTR'];compilerproc;
{$else}
procedure fpc_shortstr_to_shortstr(out res:shortstring;const sstr:shortstring);assembler;nostackframe;[public,alias: 'FPC_SHORTSTR_TO_SHORTSTR'];compilerproc;
{$endif}
{r0: __RESULT
 r1: len
 r2: sstr}

asm
    ldrb r12,[r2],#1
    cmp  r12,r1
    movgt r12,r1
    strb r12,[r0],#1
    cmp  r12,#6 (* 6 seems to be the break-even point. *)
    blt  .LStartTailCopy
    (* Align destination on 32bits. This is the only place where unrolling
       really seems to help, since in the common case, sstr is aligned on
       32 bits, therefore in the common case we need to copy 3 bytes to
       align, i.e. in the case of a loop, you wouldn't branch out early.*)
    rsb  r3,r0,#0
    ands  r3,r3,#3
    sub  r12,r12,r3
    ldrneb r1,[r2],#1
    strneb r1,[r0],#1
    subnes  r3,r3,#1
    ldrneb r1,[r2],#1
    strneb r1,[r0],#1
    subnes  r3,r3,#1
    ldrneb r1,[r2],#1
    strneb r1,[r0],#1
    subnes  r3,r3,#1
.LDoneAlign:
    (* Destination should be aligned now, but source might not be aligned,
       if this is the case, do a byte-per-byte copy. *)
    tst r2,#3
    bne .LStartTailCopy
    (* Start the main copy, 32 bit at a time. *)
    movs r3,r12,lsr #2
    and r12,r12,#3
    beq  .LStartTailCopy
.LNext4bytes:
    (* Unrolling this loop would save a little bit of time for long strings
       (>20 chars), but alas, it hurts for short strings and they are the
       common case.*)
    ldrne r1,[r2],#4
    strne r1,[r0],#4
    subnes  r3,r3,#1
    bne .LNext4bytes
.LStartTailCopy:
    (* Do remaining bytes. *)
    cmp r12,#0
    beq .LDoneTail
.LNextChar3:
    ldrb r1,[r2],#1
    strb r1,[r0],#1
    subs  r12,r12,#1
    bne .LNextChar3
.LDoneTail:
end;

procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);assembler;nostackframe;[public,alias:'FPC_SHORTSTR_ASSIGN'];compilerproc;

{r0: len
 r1: sstr
 r2: dstr}

asm
    ldrb r12,[r1],#1
    cmp  r12,r0
    movgt r12,r0
    strb r12,[r2],#1
    cmp  r12,#6 (* 6 seems to be the break-even point. *)
    blt  .LStartTailCopy
    (* Align destination on 32bits. This is the only place where unrolling
       really seems to help, since in the common case, sstr is aligned on
       32 bits, therefore in the common case we need to copy 3 bytes to
       align, i.e. in the case of a loop, you wouldn't branch out early.*)
    rsb  r3,r2,#0
    ands  r3,r3,#3
    sub  r12,r12,r3
    ldrneb r0,[r1],#1
    strneb r0,[r2],#1
    subnes  r3,r3,#1
    ldrneb r0,[r1],#1
    strneb r0,[r2],#1
    subnes  r3,r3,#1
    ldrneb r0,[r1],#1
    strneb r0,[r2],#1
    subnes  r3,r3,#1
.LDoneAlign:
    (* Destination should be aligned now, but source might not be aligned,
       if this is the case, do a byte-per-byte copy. *)
    tst r1,#3
    bne .LStartTailCopy
    (* Start the main copy, 32 bit at a time. *)
    movs r3,r12,lsr #2
    and r12,r12,#3
    beq  .LStartTailCopy
.LNext4bytes:
    (* Unrolling this loop would save a little bit of time for long strings
       (>20 chars), but alas, it hurts for short strings and they are the
       common case.*)
    ldrne r0,[r1],#4
    strne r0,[r2],#4
    subnes  r3,r3,#1
    bne .LNext4bytes
.LStartTailCopy:
    (* Do remaining bytes. *)
    cmp r12,#0
    beq .LDoneTail
.LNextChar3:
    ldrb r0,[r1],#1
    strb r0,[r2],#1
    subs  r12,r12,#1
    bne .LNextChar3
.LDoneTail:
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}

{$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_LENGTH}
{$define FPC_SYSTEM_HAS_FPC_PCHAR_LENGTH}
function fpc_Pchar_length(p:Pchar):sizeint;assembler;nostackframe;[public,alias:'FPC_PCHAR_LENGTH'];compilerproc;

asm
    cmp r0,#0
    mov r1,r0
    beq .Ldone
.Lnextchar:
    (*Are we aligned?*)
    tst r1,#3
    bne .Ltest_unaligned    (*No, do byte per byte.*)
    ldr r3,.L01010101
.Ltest_aligned:
    (*Aligned, load 4 bytes at a time.*)
    ldr r12,[r1],#4
    (*Check whether r12 contains a 0 byte.*)
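    (*Classic zero-in-word test: with r3 = $01010101, the value
      (x - $01010101) and (not x) and $80808080 (note: r3 lsl 7 = $80808080)
      is non-zero if and only if at least one byte of x is zero.*)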
    sub r2,r12,r3
    mvn r12,r12
    and r2,r2,r12
    ands r2,r2,r3,lsl #7    (*r3 lsl 7 = $80808080*)
    beq .Ltest_aligned      (*No 0 byte, repeat.*)
    sub r1,r1,#4
.Ltest_unaligned:
    ldrb r12,[r1],#1
    cmp r12,#1              (*r12 < 1 is the same as r12 = 0, but leaves the result in the carry flag*)
    bcs .Lnextchar
    (*Dirty trick: we need to subtract 1 extra because we have counted the
      terminating 0. The cmp above cleared the carry (0 - 1 borrows), so
      sbc computes r1-r0-1, which does exactly that.*)
    sbc r0,r1,r0
.Ldone:
{$if defined(cpuarmv3) or defined(cpuarmv4)}
    mov pc,lr
{$else}
    bx  lr
{$endif}
.L01010101:
    .long 0x01010101
end;
{$endif}


var
  fpc_system_lock: longint; export name 'fpc_system_lock';

function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
.Lloop:
  ldrex r1, [r0]
  sub   r1, r1, #1
  strex r2, r1, [r0]
  cmp r2, #0
  bne .Lloop
  mov r0, r1
  bx  lr
{$else}
{$if defined(LINUX) and defined(CPUARMEL)}

  stmfd r13!, {lr}
  mov r2, r0   // kuser_cmpxchg does not clobber r2 by definition
.Latomic_dec_loop:
  ldr r0, [r2]   // Load the current value

  // We expect this to work without looping most of the time
  // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
  // loop here again, we have to reload the value. Normally this just fills the
  // load stall-cycles from the above ldr, so in reality we'll not get any additional
  // delays because of this
  // Don't use ldr to load r3, to avoid trashing a cache line
  // Load 0xffff0fff into r3 and subtract to get 0xffff0fc0,
  // the kuser_cmpxchg entry point
  mvn r3, #0x0000f000
  sub r3, r3, #0x3F
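  // kuser_cmpxchg ABI (see the kernel's user helpers documentation):
  // r0 = oldval, r1 = newval, r2 = ptr; on success *ptr has been replaced,
  // r0 = 0 and the C flag is set, otherwise r0 <> 0 and C is clear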

  sub r1, r0, #1 // Decrement value
  blx r3	 // Call kuser_cmpxchg, sets C-Flag on success

  movcs r0, r1	 // We expect that to work most of the time so keep it pipeline friendly
  ldmcsfd r13!, {pc}
  b .Latomic_dec_loop // kuser_cmpxchg sets C flag on error

{$else}
// lock
  ldr r3, .Lfpc_system_lock
  mov r1, #1
.Lloop:
  swp r2, r1, [r3]
  cmp r2, #0
  bne .Lloop
// do the job
  ldr r1, [r0]
  sub r1, r1, #1
  str r1, [r0]
  mov r0, r1
// unlock and return
  str r2, [r3]
  bx  lr

.Lfpc_system_lock:
  .long fpc_system_lock
{$endif}
{$endif}
end;


function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
.Lloop:
  ldrex r1, [r0]
  add   r1, r1, #1
  strex r2, r1, [r0]
  cmp r2, #0
  bne .Lloop
  mov r0, r1
  bx  lr
{$else}
{$if defined(LINUX) and defined(CPUARMEL)}

  stmfd r13!, {lr}
  mov r2, r0   // kuser_cmpxchg does not clobber r2 by definition
.Latomic_inc_loop:
  ldr r0, [r2]   // Load the current value

  // We expect this to work without looping most of the time
  // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
  // loop here again, we have to reload the value. Normally this just fills the
  // load stall-cycles from the above ldr, so in reality we'll not get any additional
  // delays because of this
  // Don't use ldr to load r3, to avoid trashing a cache line
  // Load 0xffff0fff into r3 and subtract to get 0xffff0fc0,
  // the kuser_cmpxchg entry point
  mvn r3, #0x0000f000
  sub r3, r3, #0x3F

  add r1, r0, #1 // Increment value
  blx r3	 // Call kuser_cmpxchg, sets C-Flag on success

  movcs r0, r1	 // We expect that to work most of the time so keep it pipeline friendly
  ldmcsfd r13!, {pc}
  b .Latomic_inc_loop // kuser_cmpxchg sets C flag on error

{$else}
// lock
  ldr r3, .Lfpc_system_lock
  mov r1, #1
.Lloop:
  swp r2, r1, [r3]
  cmp r2, #0
  bne .Lloop
// do the job
  ldr r1, [r0]
  add r1, r1, #1
  str r1, [r0]
  mov r0, r1
// unlock and return
  str r2, [r3]
  bx  lr

.Lfpc_system_lock:
  .long fpc_system_lock
{$endif}
{$endif}
end;


function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
// swp is deprecated on ARMv6 and above
.Lloop:
  ldrex r2, [r0]
  strex r3, r1, [r0]
  cmp r3, #0
  bne .Lloop
  mov r0, r2
  bx  lr
{$else}
  swp r1, r1, [r0]
  mov r0,r1
{$endif}
end;

function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
.Lloop:
  ldrex r2, [r0]
  add   r12, r1, r2
  strex r3, r12, [r0]
  cmp r3, #0
  bne .Lloop
  mov  r0, r2
  bx  lr
{$else}
{$if defined(LINUX) and defined(CPUARMEL)}

  stmfd r13!, {r4, lr}
  mov r2, r0   // kuser_cmpxchg does not clobber r2 by definition
  mov r4, r1   // Save addend
.Latomic_add_loop:
  ldr r0, [r2]   // Load the current value

  // We expect this to work without looping most of the time
  // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
  // loop here again, we have to reload the value. Normally this just fills the
  // load stall-cycles from the above ldr, so in reality we'll not get any additional
  // delays because of this
  // Don't use ldr to load r3, to avoid trashing a cache line
  // Load 0xffff0fff into r3 and subtract to get 0xffff0fc0,
  // the kuser_cmpxchg entry point
  mvn r3, #0x0000f000
  sub r3, r3, #0x3F

  add r1, r0, r4 // Add to value
  blx r3	 // Call kuser_cmpxchg, sets C-Flag on success
  // r1 does not get clobbered, so just get back the original value
  // Otherwise we would have to allocate one more register and store the
  // temporary value
  subcs   r0, r1, r4
  ldmcsfd r13!, {r4, pc}

  b .Latomic_add_loop // kuser_cmpxchg failed, loop back

{$else}
// lock
  ldr r3, .Lfpc_system_lock
  mov r2, #1
.Lloop:
  swp r2, r2, [r3]
  cmp r2, #0
  bne .Lloop
// do the job
  ldr r2, [r0]
  add r1, r1, r2
  str r1, [r0]
  mov r0, r2
// unlock and return
  mov r2, #0
  str r2, [r3]
  bx  lr

.Lfpc_system_lock:
  .long fpc_system_lock
{$endif}
{$endif}
end;


function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
.Lloop:
  ldrex    r3, [r0]
  mov      r12, #0
  cmp      r3, r2
  strexeq  r12, r1, [r0]
  cmp      r12, #0
  bne      .Lloop
  mov      r0, r3
  bx       lr
{$else}
{$if defined(LINUX) and defined(CPUARMEL)}

  stmfd r13!, {r4, lr}
  mvn   r3, #0x0000f000
  sub   r3, r3, #0x3F

  mov   r4, r2 // Swap parameters around
  mov   r2, r0
  mov   r0, r4 // Use r4 because we'll need the new value for later

  // r1 and r2 will not be clobbered by kuser_cmpxchg
  // If we have to loop, r0 will be set to the original Comperand
  .Linterlocked_compare_exchange_loop:

  blx   r3       // Call kuser_cmpxchg sets C-Flag on success
  movcs r0, r4   // Return the previous value on success
  ldmcsfd r13!, {r4, pc}
  // The error case is a bit tricky, kuser_cmpxchg does not return the current value
  // So we may need to loop to avoid race conditions
  // The loop case is HIGHLY unlikely, it would require that we got rescheduled between
  // calling kuser_cmpxchg and the ldr. While we were rescheduled, another process/thread
  // would have had to set the value back to our Comperand
  ldr	r0, [r2] // Load the currently set value
  cmp   r0, r4   // Return if Comperand != current value, otherwise loop again
  ldmnefd r13!, {r4, pc}
  // The value matches Comperand again, so we have to try once more
  b	.Linterlocked_compare_exchange_loop

{$else}
// lock
  ldr r12, .Lfpc_system_lock
  mov r3, #1
.Lloop:
  swp r3, r3, [r12]
  cmp r3, #0
  bne .Lloop
// do the job
  ldr r3, [r0]
  cmp r3, r2
  streq r1, [r0]
  mov r0, r3
// unlock and return
  mov r3, #0
  str r3, [r12]
  bx  lr

.Lfpc_system_lock:
  .long fpc_system_lock
{$endif}
{$endif}
end;
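
(*
Usage sketch (illustrative only, not part of the RTL; the lock variable and
the Acquire/Release names are made up): a minimal spinlock on top of the
primitives above.

var
  lock: longint = 0;

procedure AcquireLock;
begin
  // spin until we are the caller that changes lock from 0 to 1
  while InterlockedCompareExchange(lock,1,0)<>0 do
    ;
end;

procedure ReleaseLock;
begin
  InterlockedExchange(lock,0);
end;
*)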

{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
function declocked(var l: longint) : boolean; inline;
begin
  Result:=InterLockedDecrement(l) = 0;
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
procedure inclocked(var l: longint); inline;
begin
  InterLockedIncrement(l);
end;

procedure fpc_cpucodeinit;
begin
{$ifdef FPC_SYSTEM_FPC_MOVE}
  cpu_has_edsp:=true;
  in_edsp_test:=true;
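  { ldrd is an ARMv5TE (EDSP) instruction: on older cores the probe below
    raises an undefined-instruction exception, whose handler is expected to
    clear cpu_has_edsp while in_edsp_test is set. }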
  asm
    bic r0,sp,#7
    ldrd r0,[r0]
  end;
  in_edsp_test:=false;
  if cpu_has_edsp then
    moveproc:=@move_pld
  else
    moveproc:=@move_blended;
{$endif FPC_SYSTEM_FPC_MOVE}
end;

{$define FPC_SYSTEM_HAS_SWAPENDIAN}

{ SwapEndian(<16 Bit>) being inlined is faster than using assembler }
function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    { the extra Word type cast is necessary because the "AValue shr 8" }
    { is turned into "longint(AValue) shr 8", so if AValue < 0 then    }
    { the sign bits from the upper 16 bits are shifted in rather than  }
    { zeroes.                                                          }
    Result := SmallInt((Word(AValue) shr 8) or (Word(AValue) shl 8));
  end;
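{ Worked example: for AValue = -2 ($FFFE) the casts give
  (Word($FFFE) shr 8) or (Word($FFFE) shl 8) = $00FF or $FE00 = $FEFF = -257;
  without them AValue is widened to longint first, so the shr drags ones from
  the upper 16 bits into the result. }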


function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    Result := Word((AValue shr 8) or (AValue shl 8));
  end;

(*
This is kept for reference. That's what the compiler COULD generate in these cases.
But FPC currently does not support inlining of asm-functions, so the whole call-overhead
is bigger than the gain of the optimized function.
function AsmSwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif};assembler;nostackframe;
asm
	// We're starting with 4321
{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
	mov r0, r0, lsl #16	// Shift to make that 2100
	mov r0, r0, ror #24	// Rotate to 1002
	orr r0, r0, r0, lsr #16	// Shift and combine into 0012
{$else}
	rev r0, r0		// Reverse byteorder    r0 = 1234
	mov r0, r0, lsr #16	// Shift down to 16bits r0 = 0012
{$endif}
end;

*)

function SwapEndian(const AValue: LongInt): LongInt;assembler;nostackframe;
asm
        // We're starting with r0 = 4321
{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
        mov r2, r0, lsr #24             // r2 = 0004
        and r1, r0, #16711680           // r1 = 0300
        orr r2, r2, r0, lsl #24         // r2 = 1004
        orr r2, r2, r1, lsr #8          // r2 = 1034
        and r0, r0, #65280              // r0 = 0020
        orr r0, r2, r0, lsl #8          // r0 = 1234
{$else}
	rev r0, r0
{$endif}
end;

function SwapEndian(const AValue: DWord): DWord;assembler;nostackframe;
asm
        // We're starting with r0 = 4321
{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
        mov r2, r0, lsr #24             // r2 = 0004
        and r1, r0, #16711680           // r1 = 0300
        orr r2, r2, r0, lsl #24         // r2 = 1004
        orr r2, r2, r1, lsr #8          // r2 = 1034
        and r0, r0, #65280              // r0 = 0020
        orr r0, r2, r0, lsl #8          // r0 = 1234
{$else}
	rev r0, r0
{$endif}
end;

function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
asm
        // We're starting with r0 = 4321 r1 = 8765
{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
        mov ip, r1

        mov r2, r0, lsr #24             // r2 = 0004
        and r3, r0, #16711680           // r3 = 0300
        orr r2, r2, r0, lsl #24         // r2 = 1004
        orr r2, r2, r3, lsr #8          // r2 = 1034
        and r0, r0, #65280              // r0 = 0020
        orr r1, r2, r0, lsl #8          // r1 = 1234

        mov r2, ip, lsr #24             // r2 = 0008
        and r3, ip, #16711680           // r3 = 0700
        orr r2, r2, ip, lsl #24         // r2 = 5008
        orr r2, r2, r3, lsr #8          // r2 = 5078
        and ip, ip, #65280              // ip = 0060
        orr r0, r2, ip, lsl #8          // r0 = 5678
        bx lr
{$else}
	rev r2, r0
	rev r0, r1
	mov r1, r2
{$endif}
end;

function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
asm
        // We're starting with r0 = 4321 r1 = 8765
{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
        mov ip, r1

        mov r2, r0, lsr #24             // r2 = 0004
        and r3, r0, #16711680           // r3 = 0300
        orr r2, r2, r0, lsl #24         // r2 = 1004
        orr r2, r2, r3, lsr #8          // r2 = 1034
        and r0, r0, #65280              // r0 = 0020
        orr r1, r2, r0, lsl #8          // r1 = 1234

        mov r2, ip, lsr #24             // r2 = 0008
        and r3, ip, #16711680           // r3 = 0700
        orr r2, r2, ip, lsl #24         // r2 = 5008
        orr r2, r2, r3, lsr #8          // r2 = 5078
        and ip, ip, #65280              // ip = 0060
        orr r0, r2, ip, lsl #8          // r0 = 5678
        bx lr
{$else}
	rev r2, r0
	rev r0, r1
	mov r1, r2
{$endif}
end;

{include hand-optimized assembler division code}
{$i divide.inc}