@@ -49,7 +49,8 @@ func Adapter(channels: [Int], numRepeat: Int) -> ((PythonObject) -> Void, Model)
4949 var  outs  =  [ Model . IO] ( ) 
5050 for  (i,  channel)    in  channels. enumerated ( )  { 
5151 for  j  in  0 ..< numRepeat { 
52-  let  ( skipModel,  inLayerConv2d,  outLayerConv2d,  resnetBlock)  =  ResnetBlock ( outChannels:  channel,  inConv:  previousChannel !=  channel) 
52+  let  ( skipModel,  inLayerConv2d,  outLayerConv2d,  resnetBlock)  =  ResnetBlock ( 
53+  outChannels:  channel,  inConv:  previousChannel !=  channel) 
5354 previousChannel =  channel
5455 out =  resnetBlock ( out) 
5556 let  reader :  ( PythonObject )  ->  Void  =  {  state_dict in 
@@ -107,9 +108,12 @@ func ResnetBlockLight(outChannels: Int) -> (
107108 ) 
108109} 
109110
110- func  Extractor( prefix:  String ,  channel:  Int ,  innerChannel:  Int ,  numRepeat:  Int ,  downsample:  Bool )  ->  ( ( PythonObject )  ->  Void ,  Model )  { 
111+ func  Extractor( prefix:  String ,  channel:  Int ,  innerChannel:  Int ,  numRepeat:  Int ,  downsample:  Bool ) 
112+  ->  ( ( PythonObject )  ->  Void ,  Model ) 
113+ { 
111114 let  x  =  Input ( ) 
112-  let  inConv  =  Convolution ( groups:  1 ,  filters:  innerChannel,  filterSize:  [ 1 ,  1 ] ,  hint:  Hint ( stride:  [ 1 ,  1 ] ) ) 
115+  let  inConv  =  Convolution ( 
116+  groups:  1 ,  filters:  innerChannel,  filterSize:  [ 1 ,  1 ] ,  hint:  Hint ( stride:  [ 1 ,  1 ] ) ) 
113117 var  out  =  inConv ( x) 
114118 var  readers  =  [ ( PythonObject )  ->  Void ] ( ) 
115119 for  i  in  0 ..< numRepeat { 
@@ -127,7 +131,8 @@ func Extractor(prefix: String, channel: Int, innerChannel: Int, numRepeat: Int,
127131 } 
128132 readers. append ( reader) 
129133 } 
130-  let  outConv  =  Convolution ( groups:  1 ,  filters:  channel,  filterSize:  [ 1 ,  1 ] ,  hint:  Hint ( stride:  [ 1 ,  1 ] ) ) 
134+  let  outConv  =  Convolution ( 
135+  groups:  1 ,  filters:  channel,  filterSize:  [ 1 ,  1 ] ,  hint:  Hint ( stride:  [ 1 ,  1 ] ) ) 
131136 out =  outConv ( out) 
132137 if  downsample { 
133138 let  downsample  =  AveragePool ( filterSize:  [ 2 ,  2 ] ,  hint:  Hint ( stride:  [ 2 ,  2 ] ) ) 
@@ -155,7 +160,9 @@ func AdapterLight(channels: [Int], numRepeat: Int) -> ((PythonObject) -> Void, M
155160 var  out :  Model . IO  =  x
156161 var  outs  =  [ Model . IO] ( ) 
157162 for  (i,  channel)    in  channels. enumerated ( )  { 
158-  let  ( reader,  extractor)  =  Extractor ( prefix:  " \( i) " ,  channel:  channel,  innerChannel:  channel /  4 ,  numRepeat:  numRepeat,  downsample:  i !=  0 ) 
163+  let  ( reader,  extractor)  =  Extractor ( 
164+  prefix:  " \( i) " ,  channel:  channel,  innerChannel:  channel /  4 ,  numRepeat:  numRepeat, 
165+  downsample:  i !=  0 ) 
159166 out =  extractor ( out) 
160167 outs. append ( out) 
161168 readers. append ( reader) 
@@ -246,14 +253,16 @@ func CLIPResidualAttentionBlock(prefix: String, k: Int, h: Int, b: Int, t: Int)
246253 return  ( reader,  Model ( [ x] ,  [ out] ) ) 
247254} 
248255
249- func  StyleAdapter( width:  Int ,  outputDim:  Int ,  layers:  Int ,  heads:  Int ,  tokens:  Int ,  batchSize:  Int )  ->  ( ( PythonObject )  ->  Void ,  Model ) 
256+ func  StyleAdapter( width:  Int ,  outputDim:  Int ,  layers:  Int ,  heads:  Int ,  tokens:  Int ,  batchSize:  Int ) 
257+  ->  ( ( PythonObject )  ->  Void ,  Model ) 
250258{ 
251259 let  x  =  Input ( ) 
252260 let  lnPre  =  LayerNorm ( epsilon:  1e-5 ,  axis:  [ 2 ] ) 
253261 var  out  =  lnPre ( x) 
254262 var  readers  =  [ ( PythonObject )  ->  Void ] ( ) 
255263 for  i  in  0 ..< layers { 
256-  let  ( reader,  block)  =  CLIPResidualAttentionBlock ( prefix:  " transformer_layes. \( i) " ,  k:  width /  heads,  h:  heads,  b:  batchSize,  t:  257  +  tokens) 
264+  let  ( reader,  block)  =  CLIPResidualAttentionBlock ( 
265+  prefix:  " transformer_layes. \( i) " ,  k:  width /  heads,  h:  heads,  b:  batchSize,  t:  257  +  tokens) 
257266 out =  block ( out. reshaped ( [ batchSize,  257  +  tokens,  width] ) ) 
258267 readers. append ( reader) 
259268 } 
@@ -295,21 +304,26 @@ let hint = torch.randn([2, 3, 512, 512])
295304// let adapter = ldm_modules_encoders_adapter.Adapter(cin: 64, channels: [320, 640, 1280, 1280], nums_rb: 2, ksize: 1, sk: true, use_conv: false).to(torch.device("cpu"))
296305// let adapterLight = ldm_modules_encoders_adapter.Adapter_light(cin: 64 * 3, channels: [320, 640, 1280, 1280], nums_rb: 4).to(torch.device("cpu"))
297306let  style  =  torch. randn ( [ 1 ,  257 ,  1024 ] ) 
298- let  styleAdapter  =  ldm_modules_encoders_adapter. StyleAdapter ( width:  1024 ,  context_dim:  768 ,  num_head:  8 ,  n_layes:  3 ,  num_token:  8 ) . to ( torch. device ( " cpu " ) ) 
299- styleAdapter. load_state_dict ( torch. load ( " /home/liu/workspace/T2I-Adapter/models/t2iadapter_style_sd14v1.pth " ) ) 
307+ let  styleAdapter  =  ldm_modules_encoders_adapter. StyleAdapter ( 
308+  width:  1024 ,  context_dim:  768 ,  num_head:  8 ,  n_layes:  3 ,  num_token:  8 
309+ ) . to ( torch. device ( " cpu " ) ) 
310+ styleAdapter. load_state_dict ( 
311+  torch. load ( " /home/liu/workspace/T2I-Adapter/models/t2iadapter_style_sd14v1.pth " ) ) 
300312let  state_dict  =  styleAdapter. state_dict ( ) 
301313print ( state_dict. keys ( ) ) 
302314let  ret  =  styleAdapter ( style) 
303315print ( ret) 
304316
305- let  styleEmbed  =  try   Tensor < Float > ( numpy:  state_dict [ " style_embedding " ] . type ( torch. float) . cpu ( ) . numpy ( ) ) 
317+ let  styleEmbed  =  try   Tensor < Float > ( 
318+  numpy:  state_dict [ " style_embedding " ] . type ( torch. float) . cpu ( ) . numpy ( ) ) 
306319
307320let  graph  =  DynamicGraph ( ) 
308321let  hintTensor  =  graph. variable ( try !   Tensor < Float > ( numpy:  hint. numpy ( ) ) ) . toGPU ( 0 ) 
309322let  styleTensor  =  graph. variable ( try !   Tensor < Float > ( numpy:  style. numpy ( ) ) ) . toGPU ( 0 ) 
310323// let (reader, adapternet) = Adapter(channels: [320, 640, 1280, 1280], numRepeat: 2)
311324// let (reader, adapternet) = AdapterLight(channels: [320, 640, 1280, 1280], numRepeat: 4)
312- let  ( reader,  styleadapternet)  =  StyleAdapter ( width:  1024 ,  outputDim:  768 ,  layers:  3 ,  heads:  8 ,  tokens:  8 ,  batchSize:  1 ) 
325+ let  ( reader,  styleadapternet)  =  StyleAdapter ( 
326+  width:  1024 ,  outputDim:  768 ,  layers:  3 ,  heads:  8 ,  tokens:  8 ,  batchSize:  1 ) 
313327graph. workspaceSize =  1_024  *  1_024  *  1_024 
314328graph. withNoGrad  { 
315329 // let hintIn = hintTensor.reshaped(format: .NCHW, shape: [2, 3, 64, 8, 64, 8]).permuted(0, 1, 3, 5, 2, 4).copied().reshaped(.NCHW(2, 64 * 3, 64, 64))
0 commit comments